feat(events): add plain-text body representation from HTML

Co-Authored-By: Sorunome <mail@sorunome.de>
This commit is contained in:
xenofem 2021-04-18 00:37:04 -04:00 committed by Sorunome
parent f0dd8ca061
commit a1b95c0915
No known key found for this signature in database
GPG Key ID: B19471D07FC9BE9C
5 changed files with 363 additions and 0 deletions

View File

@ -27,6 +27,7 @@ import 'utils/matrix_localizations.dart';
import 'utils/receipt.dart';
import 'utils/event_localizations.dart';
import 'utils/crypto/encrypted_file.dart';
import 'utils/html_to_text.dart';
abstract class RelationshipTypes {
static const String reply = 'm.in_reply_to';
@ -286,6 +287,12 @@ class Event extends MatrixEvent {
return '$type';
}
/// A plain-text representation of this event, stripping things like spoilers
/// and the like. Useful for plain text notifications.
///
/// When the content declares the `org.matrix.custom.html` format, the
/// formatted text is passed through [HtmlToText.convert]; otherwise [body] is
/// returned as-is.
String get plaintextBody {
  final isHtml = content['format'] == 'org.matrix.custom.html';
  return isHtml ? HtmlToText.convert(formattedText) : body;
}
/// Returns a list of [Receipt] instances for this event.
List<Receipt> get receipts {
if (!(room.roomAccountData.containsKey('m.receipt'))) return [];

View File

@ -0,0 +1,243 @@
/*
* Famedly Matrix SDK
* Copyright (C) 2021 Famedly GmbH
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
import 'package:html/parser.dart';
import 'package:html/dom.dart';
import 'package:html_unescape/html_unescape.dart';
class HtmlToText {
/// Renders an [html] fragment as pseudo-markdown plain text.
///
/// The fragment is parsed and walked node by node; `data-mx-spoiler` spans
/// are redacted along the way. Trailing whitespace is removed from the
/// result.
static String convert(String html) {
  final fragment = parseFragment(html);
  final rendered = _walkNode(_ConvertOpts(), fragment);
  // Strip whatever whitespace the last rendered node left at the end.
  return rendered.replaceAll(RegExp(r'\s*$', multiLine: false), '');
}
/// Renders the inside of a `<pre>` element for use between code fences.
///
/// Works on the raw inner HTML of [node]: an optional leading `<code ...>`
/// wrapper (and its trailing `</code>`) is removed, HTML entities are
/// unescaped, and the text is padded so that it starts and ends with a
/// newline. If the `<code>` tag's attributes contain `language-xyz`, `xyz` is
/// prepended so that it ends up directly after the opening fence.
///
/// [opts] is accepted for symmetry with the other `_parse*` helpers and is
/// not read here.
static String _parsePreContent(_ConvertOpts opts, Element node) {
  var text = node.innerHtml;
  final match =
      RegExp(r'^<code([^>]*)>', multiLine: false, caseSensitive: false)
          .firstMatch(text);
  if (match != null) {
    // remove <code> opening tag
    text = text.substring(match.end);
    // remove the </code> closing tag
    text = text.replaceAll(
        RegExp(r'</code>$', multiLine: false, caseSensitive: false), '');
  }
  text = HtmlUnescape().convert(text);
  // startsWith/endsWith rather than indexing: an empty <pre> (or an empty
  // <code> inside it) leaves `text` empty, and `text[0]` would then throw a
  // RangeError.
  if (!text.startsWith('\n')) {
    text = '\n$text';
  }
  if (!text.endsWith('\n')) {
    text += '\n';
  }
  if (match == null) {
    return text;
  }
  final language =
      RegExp(r'language-(\w+)', multiLine: false, caseSensitive: false)
          .firstMatch(match.group(1));
  if (language != null) {
    text = language.group(1) + text;
  }
  return text;
}
/// Renders the children of a `<blockquote>` with every line prefixed by `> `.
///
/// The result always ends with a newline.
static String _parseBlockquoteContent(_ConvertOpts opts, Element node) {
  final quotedLines = <String>[
    for (final line in _walkChildNodes(opts, node).split('\n')) '> $line',
  ];
  return '${quotedLines.join('\n')}\n';
}
/// Renders a `<span>`, redacting it when it is marked as a spoiler.
///
/// A span without a `data-mx-spoiler` attribute is rendered like any other
/// container. A spoiler span is replaced by one `█` per UTF-16 code unit of
/// its rendered content; a non-empty attribute value is treated as the
/// spoiler reason and shown in parentheses in front of the redaction.
static String _parseSpanContent(_ConvertOpts opts, Element node) {
  final content = _walkChildNodes(opts, node);
  final reason = node.attributes['data-mx-spoiler'];
  if (reason is! String) {
    return content;
  }
  // The redaction glyph must be a visible character: repeating an empty
  // string would silently drop the spoiler instead of masking it.
  final redacted = '█' * content.length;
  return reason.isEmpty ? redacted : '($reason) $redacted';
}
/// Renders a `<ul>` as one bullet line per `<li>` child.
///
/// The list depth in [opts] is raised only while the items themselves are
/// rendered, so nested lists see a deeper level. The bullet glyph and the
/// indentation of this list are then chosen from the restored depth, and
/// line breaks inside an item are indented to continue under that item.
static String _parseUlContent(_ConvertOpts opts, Element node) {
  opts.listDepth++;
  final items = _listChildNodes(opts, node, {'li'});
  opts.listDepth--;
  final indent = ' ' * opts.listDepth;
  final bullet = _listBulletPoints[opts.listDepth % _listBulletPoints.length];
  final continuation = '\n$indent ';
  final lines = <String>[
    for (final item in items)
      '$indent$bullet ${item.replaceAll('\n', continuation)}',
  ];
  return lines.join('\n');
}
/// Renders an `<ol>` as one numbered line per `<li>` child.
///
/// Numbering begins at the value of the `start` attribute when it consists
/// solely of decimal digits, and at 1 otherwise. The list depth in [opts] is
/// raised only while the items themselves are rendered; the indentation of
/// this list uses the restored depth, and line breaks inside an item are
/// indented to continue under that item.
static String _parseOlContent(_ConvertOpts opts, Element node) {
  opts.listDepth++;
  final entries = _listChildNodes(opts, node, {'li'});
  opts.listDepth--;
  var number = 1;
  final start = node.attributes['start'];
  if (start is String &&
      RegExp(r'^[0-9]+$', multiLine: false).hasMatch(start)) {
    // `start` is the ordinal of the FIRST item, so it is used as-is rather
    // than incremented before the first entry. tryParse guards against digit
    // strings too long to fit in an int.
    number = int.tryParse(start) ?? 1;
  }
  final indent = ' ' * opts.listDepth;
  final lines = <String>[];
  for (final entry in entries) {
    lines.add('$indent$number. ${entry.replaceAll('\n', '\n$indent ')}');
    number++;
  }
  return lines.join('\n');
}
/// Bullet glyphs for unordered lists, indexed by nesting depth.
///
/// Depths beyond the end of the list wrap around (the lookup is done modulo
/// the list length). Every entry must be a visible character, otherwise list
/// items are rendered without any bullet.
static const _listBulletPoints = <String>['●', '○', '■', '‣'];
/// Renders each direct child of [node] separately and returns the results in
/// document order.
///
/// When [types] is given and non-empty, only child elements whose lower-cased
/// tag name is contained in it are rendered; text children and other elements
/// are skipped. Without a filter every child is rendered.
static List<String> _listChildNodes(_ConvertOpts opts, Element node,
    [Iterable<String> types]) {
  final filtering = types != null && types.isNotEmpty;

  // Whether the active tag filter excludes [child].
  bool isExcluded(Node child) {
    if (!filtering) return false;
    if (child is Text) return true;
    return child is Element && !types.contains(child.localName.toLowerCase());
  }

  return <String>[
    for (final child in node.nodes)
      if (!isExcluded(child)) _walkNode(opts, child),
  ];
}
/// Tags that are laid out as blocks: when one of them follows text that does
/// not already end with a newline, [_walkChildNodes] inserts a line break
/// before it.
static const _blockTags = <String>{
  'blockquote', 'ul', 'ol',
  'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
  'pre',
};
/// Renders all children of [node] in order and concatenates the results.
///
/// Two separators are inserted between children:
///  * a blank line between a `<p>` and the previous element if that was also
///    a `<p>` (non-element nodes in between are ignored for this check);
///  * a single newline before a block-level tag (see [_blockTags]) when the
///    text rendered so far is non-empty and does not already end with one.
static String _walkChildNodes(_ConvertOpts opts, Node node) {
  var output = '';
  var previousTag = '';
  for (final child in node.nodes) {
    final tag = child is Element ? child.localName.toLowerCase() : '';
    final isParagraphPair = tag == 'p' && previousTag == 'p';
    if (isParagraphPair) {
      output += '\n\n';
    } else if (_blockTags.contains(tag) &&
        output.isNotEmpty &&
        !output.endsWith('\n')) {
      output += '\n';
    }
    output += _walkNode(opts, child);
    // Only elements update the "previous tag"; text nodes leave it untouched.
    if (tag.isNotEmpty) {
      previousTag = tag;
    }
  }
  return output;
}
/// Renders a single DOM [node] as pseudo-markdown text.
///
/// Text nodes are returned verbatim, except that a text node consisting of
/// exactly one newline is dropped. Elements are dispatched on their
/// lower-cased tag name; unknown elements and non-element containers are
/// rendered as the concatenation of their children.
static String _walkNode(_ConvertOpts opts, Node node) {
  if (node is Text) {
    // ignore \n between single nodes
    final text = node.text;
    return text == '\n' ? '' : text;
  }
  if (node is Element) {
    final tag = node.localName.toLowerCase();

    // Inline formatting: the rendered children are wrapped in the same marker
    // on both sides.
    const inlineMarkers = <String, String>{
      'em': '*',
      'i': '*',
      'strong': '**',
      'b': '**',
      'u': '__',
      'ins': '__',
      'del': '~~',
      'strike': '~~',
      's': '~~',
    };
    final marker = inlineMarkers[tag];
    if (marker != null) {
      return '$marker${_walkChildNodes(opts, node)}$marker';
    }

    switch (tag) {
      case 'code':
        return '`${node.text}`';
      case 'pre':
        return '```${_parsePreContent(opts, node)}```\n';
      case 'a':
        final target = (node.attributes['href'] ?? '').toLowerCase();
        final label = _walkChildNodes(opts, node);
        // matrix.to and matrix: links are rendered without the link marker.
        final isMatrixLink = target.startsWith('https://matrix.to/#/') ||
            target.startsWith('matrix:');
        return isMatrixLink ? label : '🔗$label';
      case 'img':
        // First available of alt text, title and source URL.
        return node.attributes['alt'] ??
            node.attributes['title'] ??
            node.attributes['src'] ??
            '';
      case 'br':
        return '\n';
      case 'blockquote':
        return _parseBlockquoteContent(opts, node);
      case 'ul':
        return _parseUlContent(opts, node);
      case 'ol':
        return _parseOlContent(opts, node);
      case 'mx-reply':
        // The content of <mx-reply> is dropped entirely.
        return '';
      case 'hr':
        return '\n----------\n';
      case 'h1':
      case 'h2':
      case 'h3':
      case 'h4':
      case 'h5':
      case 'h6':
        // The digit in the tag name is the heading level.
        final level = int.parse(tag[1]);
        return '${'#' * level} ${_walkChildNodes(opts, node)}\n';
      case 'span':
        return _parseSpanContent(opts, node);
      default:
        return _walkChildNodes(opts, node);
    }
  }
  return _walkChildNodes(opts, node);
}
}
/// Mutable state shared by the helpers of a single [HtmlToText.convert] run.
class _ConvertOpts {
  /// Current list nesting level; the list renderers raise it while the items
  /// of a `<ul>`/`<ol>` are being rendered and lower it again afterwards.
  var listDepth = 0;
}

View File

@ -21,6 +21,7 @@ dependencies:
ffi: ^1.0.0
js: ^0.6.3
slugify: ^2.0.0
html: ^0.15.0
dev_dependencies:
pedantic: ^1.11.0

View File

@ -945,6 +945,20 @@ void main() {
expect(
event.aggregatedEvents(timeline, RelationshipTypes.edit), <Event>{});
});
// The content declares the `org.matrix.custom.html` format, so
// `plaintextBody` is expected to be derived from `formatted_body` (bold text
// becomes `**blah**`) rather than from the plain `body`.
test('plaintextBody', () {
final event = Event.fromJson({
'type': EventTypes.Message,
'content': {
'body': 'blah',
'msgtype': 'm.text',
'format': 'org.matrix.custom.html',
'formatted_body': '<b>blah</b>',
},
'event_id': '\$source',
'sender': '@alice:example.org',
}, null);
expect(event.plaintextBody, '**blah**');
});
test('getDisplayEvent', () {
var event = Event.fromJson({
'type': EventTypes.Message,

View File

@ -0,0 +1,98 @@
/*
* Famedly Matrix SDK
* Copyright (C) 2021 Famedly GmbH
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
import 'package:matrix/src/utils/html_to_text.dart';
import 'package:test/test.dart';
/// Table-driven checks for [HtmlToText.convert]: every key of the table is an
/// HTML input and its value is the exact plain text expected for it.
void main() {
  group('htmlToText', () {
    test('stuff', () async {
      final expectations = <String, String>{
        // Plain text and inline formatting.
        '': '',
        'hello world\nthis is a test': 'hello world\nthis is a test',
        '<em>That\'s</em> not a test, <strong>this</strong> is a test':
            '*That\'s* not a test, **this** is a test',
        'Visit <del><a href="http://example.com">our website</a></del> (outdated)':
            'Visit ~~🔗our website~~ (outdated)',
        // Spoilers, with and without a reason.
        '(cw spiders) <span data-mx-spoiler>spiders are pretty cool</span>':
            '(cw spiders) ███████████████████████',
        '<span data-mx-spoiler="cw spiders">spiders are pretty cool</span>':
            '(cw spiders) ███████████████████████',
        '<img src="test.gif" alt="a test case" />': 'a test case',
        'List of cute animals:\n<ul>\n<li>Kittens</li>\n<li>Puppies</li>\n<li>Snakes<br/>(I think they\'re cute!)</li>\n</ul>\n(This list is incomplete, you can help by adding to it!)':
            'List of cute animals:\n● Kittens\n● Puppies\n● Snakes\n (I think they\'re cute!)\n(This list is incomplete, you can help by adding to it!)',
        // One case per inline tag.
        '<em>fox</em>': '*fox*',
        '<i>fox</i>': '*fox*',
        '<strong>fox</i>': '**fox**',
        '<b>fox</b>': '**fox**',
        '<u>fox</u>': '__fox__',
        '<ins>fox</ins>': '__fox__',
        '<del>fox</del>': '~~fox~~',
        '<strike>fox</strike>': '~~fox~~',
        '<s>fox</s>': '~~fox~~',
        // Inline code and fenced code blocks.
        '<code>&gt;fox</code>': '`>fox`',
        '<pre>meep</pre>': '```\nmeep\n```',
        '<pre>meep\n</pre>': '```\nmeep\n```',
        '<pre><code class="language-floof">meep</code></pre>':
            '```floof\nmeep\n```',
        'before<pre>code</pre>after': 'before\n```\ncode\n```\nafter',
        '<p>before</p><pre>code</pre><p>after</p>':
            'before\n```\ncode\n```\nafter',
        // Paragraphs, links, images and line breaks.
        '<p>fox</p>': 'fox',
        '<p>fox</p><p>floof</p>': 'fox\n\nfloof',
        '<a href="https://example.org">website</a>': '🔗website',
        '<a href="https://matrix.to/#/@user:example.org">fox</a>': 'fox',
        '<a href="matrix:u/user:example.org">fox</a>': 'fox',
        '<img alt=":wave:" src="mxc://fox">': ':wave:',
        'fox<br>floof': 'fox\nfloof',
        // Block quotes, including nesting.
        '<blockquote>fox</blockquote>floof': '> fox\nfloof',
        '<blockquote><p>fox</p></blockquote>floof': '> fox\nfloof',
        '<blockquote><p>fox</p></blockquote><p>floof</p>': '> fox\nfloof',
        'a<blockquote>fox</blockquote>floof': 'a\n> fox\nfloof',
        '<blockquote><blockquote>fox</blockquote>floof</blockquote>fluff':
            '> > fox\n> floof\nfluff',
        // Ordered and unordered lists, including mixed nesting.
        '<ul><li>hey<ul><li>a</li><li>b</li></ul></li><li>foxies</li></ul>':
            '● hey\n ○ a\n ○ b\n● foxies',
        '<ol><li>a</li><li>b</li></ol>': '1. a\n2. b',
        '<ol><li>a<ol><li>aa</li><li>bb</li></ol></li><li>b</li></ol>':
            '1. a\n 1. aa\n 2. bb\n2. b',
        '<ol><li>a<ul><li>aa</li><li>bb</li></ul></li><li>b</li></ol>':
            '1. a\n ○ aa\n ○ bb\n2. b',
        '<ul><li>a<ol><li>aa</li><li>bb</li></ol></li><li>b</li></ul>':
            '● a\n 1. aa\n 2. bb\n● b',
        // Reply fallbacks, rules and headings.
        '<mx-reply>bunny</mx-reply>fox': 'fox',
        'fox<hr>floof': 'fox\n----------\nfloof',
        '<p>fox</p><hr><p>floof</p>': 'fox\n----------\nfloof',
        '<h1>fox</h1>floof': '# fox\nfloof',
        '<h1>fox</h1><p>floof</p>': '# fox\nfloof',
        'floof<h1>fox</h1>': 'floof\n# fox',
        '<p>floof</p><h1>fox</h1>': 'floof\n# fox',
        '<h2>fox</h2>': '## fox',
        '<h3>fox</h3>': '### fox',
        '<h4>fox</h4>': '#### fox',
        '<h5>fox</h5>': '##### fox',
        '<h6>fox</h6>': '###### fox',
        '<span>fox</span>': 'fox',
        '<p>fox</p>\n<p>floof</p>': 'fox\n\nfloof',
      };
      expectations.forEach((html, expected) {
        expect(HtmlToText.convert(html), expected);
      });
    });
  });
}