Fix spurious html tags like <shrug>

* Only render HTML tags in markdown if they're del tags
 * Consider non-allowed HTML tags as plain text nodes, so
   a message of just '<shrug>' doesn't need to be sent as
   HTML
 * Consequently rewrite isPlainText to just look at the parse
   tree rather than making and gutting a renderer to walk
   the tree (now we're using a library that actually produces
   a meaningful parse tree).
 * Tweak when we put \n on text output to avoid putting \n on
   the end of messages.

Fixes https://github.com/vector-im/riot-web/issues/3065
This commit is contained in:
David Baker 2017-02-02 14:17:07 +00:00
parent 63e47d8677
commit 853c89dfdc

View file

@ -15,6 +15,45 @@ limitations under the License.
*/ */
import commonmark from 'commonmark'; import commonmark from 'commonmark';
import escape from 'lodash/escape';
// Names of the HTML tags that is_allowed_html_tag() accepts; any other
// tag is not on the list and is handled as plain text by the callers.
const ALLOWED_HTML_TAGS = ['del'];
// These types of node are definitely text: isPlainText() skips over
// them when deciding whether a message needs to be sent as HTML.
const TEXT_NODES = ['text', 'softbreak', 'linebreak', 'paragraph', 'document'];
/**
 * Decides whether an HTML node is one of the tags we are willing to
 * pass through as real HTML.
 *
 * @param {Object} node Node whose `literal` holds the raw tag text.
 * @returns {boolean} true when `literal` is a bare opening or closing
 *     tag whose name appears in ALLOWED_HTML_TAGS, false otherwise.
 */
function is_allowed_html_tag(node) {
    // The capture group takes everything between '<' (plus an optional
    // '/') and the final '>', so a tag carrying attributes never equals
    // a bare tag name. That is acceptable: only <del> is allowed anyway.
    const found = /^<\/?(.*)>$/.exec(node.literal);
    if (found === null || found.length !== 2) {
        return false;
    }
    const tagName = found[1];
    return ALLOWED_HTML_TAGS.indexOf(tagName) !== -1;
}
/**
 * Renderer callback for HTML nodes; must be invoked with the renderer
 * as `this` because it writes through `this.lit`.
 *
 * Allowed tags are emitted verbatim; anything else is escaped first so
 * that it is output as text rather than as markup.
 *
 * @param {Object} node Node whose `literal` holds the raw HTML text.
 */
function html_if_tag_allowed(node) {
    const passThrough = is_allowed_html_tag(node);
    this.lit(passThrough ? node.literal : escape(node.literal));
}
/*
 * Tells whether the parse tree that `node` belongs to is made up of
 * several block level elements (ie. lines) rather than a single one.
 *
 * The `parent` links are followed from `node` up to the root of the
 * tree; the result is true when the root's first and last children
 * differ, and false when the root has only a single line.
 */
function is_multi_line(node) {
    let root = node;
    for (let up = node.parent; up; up = up.parent) {
        root = up;
    }
    const { firstChild, lastChild } = root;
    // Loose comparison on purpose: null and undefined count as equal.
    return firstChild != lastChild;
}
/** /**
* Class that wraps commonmark, adding the ability to see whether * Class that wraps commonmark, adding the ability to see whether
@ -30,29 +69,26 @@ export default class Markdown {
} }
isPlainText() { isPlainText() {
// we determine if the message requires markdown by const walker = this.parsed.walker();
// running the parser on the tokens with a dummy
// rendered and seeing if any of the renderer's
// functions are called other than those noted below.
// TODO: can't we just examine the output of the parser?
let is_plain = true;
function setNotPlain() { let ev;
is_plain = false; while ( (ev = walker.next()) ) {
const node = ev.node;
if (TEXT_NODES.indexOf(node.type) > -1) {
// definitely text
continue;
} else if (node.type == 'html_inline' || node.type == 'html_block') {
// if it's an allowed html tag, we need to render it and therefore
// we will need to use HTML. If it's not allowed, it's not HTML since
// we'll just be treating it as text.
if (is_allowed_html_tag(node)) {
return false;
}
} else {
return false;
}
} }
return true;
const dummy_renderer = new commonmark.HtmlRenderer();
for (const k of Object.keys(commonmark.HtmlRenderer.prototype)) {
dummy_renderer[k] = setNotPlain;
}
// text and paragraph are just text
dummy_renderer.text = function(t) { return t; };
dummy_renderer.softbreak = function(t) { return t; };
dummy_renderer.paragraph = function(t) { return t; };
dummy_renderer.render(this.parsed);
return is_plain;
} }
toHTML() { toHTML() {
@ -65,20 +101,27 @@ export default class Markdown {
// 'inline', rather than unnecessarily wrapped in its own // 'inline', rather than unnecessarily wrapped in its own
// p tag. If, however, we have multiple nodes, each gets // p tag. If, however, we have multiple nodes, each gets
// its own p tag to keep them as separate paragraphs. // its own p tag to keep them as separate paragraphs.
var par = node; if (is_multi_line(node)) {
while (par.parent) {
par = par.parent;
}
if (par.firstChild != par.lastChild) {
real_paragraph.call(this, node, entering); real_paragraph.call(this, node, entering);
} }
}; };
renderer.html_inline = html_if_tag_allowed;
renderer.html_block = function(node) {
// as with `paragraph`, we only insert line breaks
// if there are multiple lines in the markdown.
const isMultiLine = is_multi_line(node);
if (isMultiLine) this.cr();
html_if_tag_allowed.call(this, node);
if (isMultiLine) this.cr();
}
return renderer.render(this.parsed); return renderer.render(this.parsed);
} }
/* /*
* Render the mrkdown message to plain text. That is, essentially * Render the markdown message to plain text. That is, essentially
* just remove any backslashes escaping what would otherwise be * just remove any backslashes escaping what would otherwise be
* markdown syntax * markdown syntax
* (to fix https://github.com/vector-im/riot-web/issues/2870) * (to fix https://github.com/vector-im/riot-web/issues/2870)
@ -96,22 +139,18 @@ export default class Markdown {
}; };
renderer.paragraph = function(node, entering) { renderer.paragraph = function(node, entering) {
// If there is only one top level node, just return the // as with toHTML, only append lines to paragraphs if there are
// bare text: it's a single line of text and so should be // multiple paragraphs
// 'inline', rather than unnecessarily wrapped in its own if (is_multi_line(node)) {
// p tag. If, however, we have multiple nodes, each gets if (!entering && node.next) {
// its own p tag to keep them as separate paragraphs.
var par = node;
while (par.parent) {
node = par;
par = par.parent;
}
if (node != par.lastChild) {
if (!entering) {
this.lit('\n\n'); this.lit('\n\n');
} }
} }
}; };
renderer.html_block = function(node) {
this.lit(node.literal);
if (is_multi_line(node) && node.next) this.lit('\n\n');
}
return renderer.render(this.parsed); return renderer.render(this.parsed);
} }