diff --git a/lib/src/event.dart b/lib/src/event.dart index ec5b6ae6..03460610 100644 --- a/lib/src/event.dart +++ b/lib/src/event.dart @@ -27,6 +27,7 @@ import 'utils/matrix_localizations.dart'; import 'utils/receipt.dart'; import 'utils/event_localizations.dart'; import 'utils/crypto/encrypted_file.dart'; +import 'utils/html_to_text.dart'; abstract class RelationshipTypes { static const String reply = 'm.in_reply_to'; @@ -286,6 +287,12 @@ class Event extends MatrixEvent { return '$type'; } + /// Returns a plain-text representation of the event, stripping things + /// like spoilers and the like. Useful for plain text notifications. + /// + /// When the content's `format` is `org.matrix.custom.html`, [formattedText] + /// is converted with [HtmlToText.convert]; otherwise [body] is returned. + String get plaintextBody => content['format'] == 'org.matrix.custom.html' + ? HtmlToText.convert(formattedText) + : body; + /// Returns a list of [Receipt] instances for this event. List get receipts { if (!(room.roomAccountData.containsKey('m.receipt'))) return []; diff --git a/lib/src/utils/html_to_text.dart b/lib/src/utils/html_to_text.dart new file mode 100644 index 00000000..e2f4e702 --- /dev/null +++ b/lib/src/utils/html_to_text.dart @@ -0,0 +1,243 @@ +/* + * Famedly Matrix SDK + * Copyright (C) 2021 Famedly GmbH + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <https://www.gnu.org/licenses/>. 
+ */
+
+import 'package:html/parser.dart';
+import 'package:html/dom.dart';
+import 'package:html_unescape/html_unescape.dart';
+
+/// Converts HTML into a pseudo-markdown plain text representation.
+class HtmlToText {
+  /// Matches a `<code ...>` opening tag at the very start of a `<pre>`
+  /// element's inner HTML. Group 1 holds the raw attribute text of that tag.
+  static final _codeOpenTag =
+      RegExp(r'^<code([^>]*)>', multiLine: false, caseSensitive: false);
+
+  /// Matches the `</code>` closing tag at the very end of the inner HTML.
+  static final _codeCloseTag =
+      RegExp(r'</code>$', multiLine: false, caseSensitive: false);
+
+  /// Captures `xyz` from a `language-xyz` token in the `<code>` attributes.
+  static final _codeLanguage =
+      RegExp(r'language-(\w+)', multiLine: false, caseSensitive: false);
+
+  /// Converts an HTML string to a pseudo-markdown plain text representation,
+  /// with `data-mx-spoiler` spans redacted.
+  ///
+  /// Trailing whitespace is removed from the result.
+  static String convert(String html) {
+    final opts = _ConvertOpts();
+    final reply = _walkNode(opts, parseFragment(html));
+    return reply.replaceAll(RegExp(r'\s*$', multiLine: false), '');
+  }
+
+  /// Returns the text that goes between the code fences for a `<pre>` [node].
+  ///
+  /// The content is HTML-unescaped and padded so that it starts and ends with
+  /// a newline. If the `<pre>` directly wraps a `<code>` element, that wrapper
+  /// is removed and a `language-xyz` token found in its attributes is
+  /// prepended, so it ends up right after the opening fence.
+  static String _parsePreContent(_ConvertOpts opts, Element node) {
+    final html = node.innerHtml;
+    final match = _codeOpenTag.firstMatch(html);
+    if (match == null) {
+      return _wrapInNewlines(HtmlUnescape().convert(html));
+    }
+    // Drop the opening `<code ...>` tag and the trailing `</code>` so that
+    // only the code itself is left.
+    final inner = html.substring(match.end).replaceAll(_codeCloseTag, '');
+    final text = _wrapInNewlines(HtmlUnescape().convert(inner));
+    final language = _codeLanguage.firstMatch(match.group(1) ?? '');
+    return language == null ? text : '${language.group(1)}$text';
+  }
+
+  /// Returns [text] padded so that it both starts and ends with `\n`.
+  ///
+  /// An empty [text] becomes a single newline instead of raising a
+  /// `RangeError`, which indexing the first character would do.
+  static String _wrapInNewlines(String text) {
+    var wrapped = text.startsWith('\n') ? text : '\n$text';
+    if (!wrapped.endsWith('\n')) {
+      wrapped += '\n';
+    }
+    return wrapped;
+  }
+
+  /// Renders a `<blockquote>`: every line of the converted children is
+  /// prefixed with `> ` and the block is terminated by a newline.
+  static String _parseBlockquoteContent(_ConvertOpts opts, Element node) {
+    final msg = _walkChildNodes(opts, node);
+    final quoted = msg.split('\n').map((s) => '> $s').join('\n');
+    return '$quoted\n';
+  }
+
+  /// Renders a `<span>`.
+  ///
+  /// A span carrying a `data-mx-spoiler` attribute is replaced by one `█` per
+  /// UTF-16 code unit of its converted content, prefixed with `(reason) ` when
+  /// the attribute value is not empty. Any other span is rendered as its
+  /// children.
+  static String _parseSpanContent(_ConvertOpts opts, Element node) {
+    final content = _walkChildNodes(opts, node);
+    final reason = node.attributes['data-mx-spoiler'];
+    if (reason is String) {
+      final redacted = '█' * content.length;
+      return reason.isEmpty ? redacted : '($reason) $redacted';
+    }
+    return content;
+  }
+
+  /// Renders a `<ul>` as one bullet line per `<li>` child.
+  ///
+  /// The bullet glyph cycles through [_listBulletPoints] by nesting depth, and
+  /// continuation lines of an entry are indented to stay below their bullet.
+  static String _parseUlContent(_ConvertOpts opts, Element node) {
+    // Children are converted one level deeper; the depth is restored before
+    // this list's own indentation is computed.
+    opts.listDepth++;
+    final entries = _listChildNodes(opts, node, {'li'});
+    opts.listDepth--;
+    final bulletPoint =
+        _listBulletPoints[opts.listDepth % _listBulletPoints.length];
+    final indent = ' ' * opts.listDepth;
+
+    return entries
+        .map((s) => '$indent$bulletPoint ${s.replaceAll('\n', '\n$indent ')}')
+        .join('\n');
+  }
+
+  /// Renders an `<ol>` as one numbered line per `<li>` child.
+  ///
+  /// Numbering begins at the list's `start` attribute when that attribute is
+  /// a plain run of decimal digits, and at 1 otherwise.
+  static String _parseOlContent(_ConvertOpts opts, Element node) {
+    opts.listDepth++;
+    final entries = _listChildNodes(opts, node, {'li'});
+    opts.listDepth--;
+    var first = 1;
+    final startAttr = node.attributes['start'];
+    if (startAttr is String &&
+        RegExp(r'^[0-9]+$', multiLine: false).hasMatch(startAttr)) {
+      // `tryParse` guards against digit strings too long to fit in an int.
+      first = int.tryParse(startAttr) ?? 1;
+    }
+    final indent = ' ' * opts.listDepth;
+
+    final lines = <String>[];
+    for (var i = 0; i < entries.length; i++) {
+      final entry = entries[i].replaceAll('\n', '\n$indent ');
+      lines.add('$indent${first + i}. $entry');
+    }
+    return lines.join('\n');
+  }
+
+  static const _listBulletPoints = ['●', '○', '■', '‣'];
+
+  /// Converts the children of [node] individually and returns the results.
+  ///
+  /// When [types] is not empty, only child elements whose lower-cased tag name
+  /// is contained in [types] are converted; text, comments and every other
+  /// child are skipped so they cannot show up as empty entries.
+  static List<String> _listChildNodes(_ConvertOpts opts, Element node,
+      [Iterable<String> types = const <String>[]]) {
+    final replies = <String>[];
+    for (final child in node.nodes) {
+      if (types.isNotEmpty &&
+          !(child is Element &&
+              types.contains((child.localName ?? '').toLowerCase()))) {
+        continue;
+      }
+      replies.add(_walkNode(opts, child));
+    }
+    return replies;
+  }
+
+  /// Tags whose output has to begin on a line of its own.
+  static const _blockTags = {
+    'blockquote',
+    'ul',
+    'ol',
+    'h1',
+    'h2',
+    'h3',
+    'h4',
+    'h5',
+    'h6',
+    'pre',
+  };
+
+  /// Converts all children of [node] and concatenates the results.
+  ///
+  /// A `<p>` whose preceding element sibling was also a `<p>` is separated
+  /// from it by a blank line, and a block-level tag is moved to a fresh line
+  /// when the text produced so far does not already end with a newline.
+  static String _walkChildNodes(_ConvertOpts opts, Node node) {
+    var reply = '';
+    // Tag of the most recent element child; text nodes in between do not
+    // reset it, so whitespace between two paragraphs is irrelevant.
+    var lastTag = '';
+    for (final child in node.nodes) {
+      final thisTag =
+          child is Element ? (child.localName ?? '').toLowerCase() : '';
+      if (thisTag == 'p' && lastTag == 'p') {
+        reply += '\n\n';
+      } else if (_blockTags.contains(thisTag) &&
+          reply.isNotEmpty &&
+          !reply.endsWith('\n')) {
+        reply += '\n';
+      }
+      reply += _walkNode(opts, child);
+      if (thisTag.isNotEmpty) {
+        lastTag = thisTag;
+      }
+    }
+    return reply;
+  }
+
+  /// Converts a single DOM [node] to its plain text form.
+  ///
+  /// Text nodes are returned verbatim, except that a text node consisting of
+  /// exactly one newline is dropped. Known elements are mapped to markdown-like
+  /// markers; unknown elements and other node kinds are rendered as the
+  /// concatenation of their children.
+  static String _walkNode(_ConvertOpts opts, Node node) {
+    if (node is Text) {
+      // ignore \n between single nodes
+      final text = node.text;
+      return text == '\n' ? '' : text;
+    }
+    if (node is! Element) {
+      return _walkChildNodes(opts, node);
+    }
+    final tag = (node.localName ?? '').toLowerCase();
+    switch (tag) {
+      case 'em':
+      case 'i':
+        return '*${_walkChildNodes(opts, node)}*';
+      case 'strong':
+      case 'b':
+        return '**${_walkChildNodes(opts, node)}**';
+      case 'u':
+      case 'ins':
+        return '__${_walkChildNodes(opts, node)}__';
+      case 'del':
+      case 'strike':
+      case 's':
+        return '~~${_walkChildNodes(opts, node)}~~';
+      case 'code':
+        // Inline code keeps its raw text; nested markup is not converted.
+        return '`${node.text}`';
+      case 'pre':
+        return '```${_parsePreContent(opts, node)}```\n';
+      case 'a':
+        final href = (node.attributes['href'] ?? '').toLowerCase();
+        final content = _walkChildNodes(opts, node);
+        // Matrix permalinks and `matrix:` URIs are shown as plain content,
+        // every other link gets a link marker.
+        if (href.startsWith('https://matrix.to/#/') ||
+            href.startsWith('matrix:')) {
+          return content;
+        }
+        return '🔗$content';
+      case 'img':
+        return node.attributes['alt'] ??
+            node.attributes['title'] ??
+            node.attributes['src'] ??
+            '';
+      case 'br':
+        return '\n';
+      case 'blockquote':
+        return _parseBlockquoteContent(opts, node);
+      case 'ul':
+        return _parseUlContent(opts, node);
+      case 'ol':
+        return _parseOlContent(opts, node);
+      case 'mx-reply':
+        // The quoted reply fallback is dropped entirely.
+        return '';
+      case 'hr':
+        return '\n----------\n';
+      case 'h1':
+      case 'h2':
+      case 'h3':
+      case 'h4':
+      case 'h5':
+      case 'h6':
+        // The digit of the tag name is the number of `#` marks.
+        final mark = '#' * int.parse(tag[1]);
+        return '$mark ${_walkChildNodes(opts, node)}\n';
+      case 'span':
+        return _parseSpanContent(opts, node);
+      default:
+        return _walkChildNodes(opts, node);
+    }
+  }
+}
+
+/// Mutable state shared by one [HtmlToText.convert] run.
+class _ConvertOpts {
+  /// Nesting depth of the list currently being rendered; 0 outside any list.
+  int listDepth = 0;
+}
diff --git a/pubspec.yaml b/pubspec.yaml
index 1b7c4142..d9a2f40f 100644
--- a/pubspec.yaml
+++ b/pubspec.yaml
@@ -21,6 +21,7 @@ dependencies:
   ffi: ^1.0.0
   js: ^0.6.3
   slugify: ^2.0.0
+  html: ^0.15.0
 
 dev_dependencies:
   pedantic: ^1.11.0
diff --git a/test/event_test.dart b/test/event_test.dart
index a0cb2c57..725aaf62 100644
--- a/test/event_test.dart
+++ b/test/event_test.dart
@@ -945,6 +945,20 @@ void main() {
       expect(
           event.aggregatedEvents(timeline, RelationshipTypes.edit), {});
     });
+    test('plaintextBody', () {
+      final event = Event.fromJson({
+        'type': EventTypes.Message,
+        'content': {
+          'body': 'blah',
+          'msgtype': 'm.text',
+          'format': 'org.matrix.custom.html',
+          'formatted_body': '<b>blah</b>',
+        },
+        'event_id': '\$source',
+        'sender': '@alice:example.org',
+      }, null);
+      expect(event.plaintextBody, '**blah**');
+    });
     test('getDisplayEvent', () {
       var event = Event.fromJson({
         'type': EventTypes.Message,
diff --git a/test/html_to_text_test.dart b/test/html_to_text_test.dart
new file mode 100644
index 00000000..63803aac
--- /dev/null
+++ b/test/html_to_text_test.dart
@@ -0,0 +1,98 @@
+/*
+ * Famedly Matrix SDK
+ * Copyright (C) 2021 Famedly GmbH
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as
+ * published by the Free Software
Foundation, either version 3 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ + +import 'package:matrix/src/utils/html_to_text.dart'; +import 'package:test/test.dart'; + +void main() { + group('htmlToText', () { + test('stuff', () async { + final testMap = { + '': '', + 'hello world\nthis is a test': 'hello world\nthis is a test', + 'That\'s not a test, this is a test': + '*That\'s* not a test, **this** is a test', + 'Visit our website (outdated)': + 'Visit ~~🔗our website~~ (outdated)', + '(cw spiders) spiders are pretty cool': + '(cw spiders) ███████████████████████', + 'spiders are pretty cool': + '(cw spiders) ███████████████████████', + 'a test case': 'a test case', + 'List of cute animals:\n
    \n
  • Kittens
  • \n
  • Puppies
  • \n
  • Snakes
    (I think they\'re cute!)
  • \n
\n(This list is incomplete, you can help by adding to it!)': + 'List of cute animals:\n● Kittens\n● Puppies\n● Snakes\n (I think they\'re cute!)\n(This list is incomplete, you can help by adding to it!)', + 'fox': '*fox*', + 'fox': '*fox*', + 'fox': '**fox**', + 'fox': '**fox**', + 'fox': '__fox__', + 'fox': '__fox__', + 'fox': '~~fox~~', + 'fox': '~~fox~~', + 'fox': '~~fox~~', + '>fox': '`>fox`', + '
meep
': '```\nmeep\n```', + '
meep\n
': '```\nmeep\n```', + '
meep
': + '```floof\nmeep\n```', + 'before
code
after': 'before\n```\ncode\n```\nafter', + '

before

code

after

': + 'before\n```\ncode\n```\nafter', + '

fox

': 'fox', + '

fox

floof

': 'fox\n\nfloof', + 'website': '🔗website', + 'fox': 'fox', + 'fox': 'fox', + ':wave:': ':wave:', + 'fox
floof': 'fox\nfloof', + '
fox
floof': '> fox\nfloof', + '

fox

floof': '> fox\nfloof', + '

fox

floof

': '> fox\nfloof', + 'a
fox
floof': 'a\n> fox\nfloof', + '
fox
floof
fluff': + '> > fox\n> floof\nfluff', + '
  • hey
    • a
    • b
  • foxies
': + '● hey\n ○ a\n ○ b\n● foxies', + '
  1. a
  2. b
': '1. a\n2. b', + '
  1. a
    1. aa
    2. bb
  2. b
': + '1. a\n 1. aa\n 2. bb\n2. b', + '
  1. a
    • aa
    • bb
  2. b
': + '1. a\n ○ aa\n ○ bb\n2. b', + '
  • a
    1. aa
    2. bb
  • b
': + '● a\n 1. aa\n 2. bb\n● b', + 'bunnyfox': 'fox', + 'fox
floof': 'fox\n----------\nfloof', + '

fox


floof

': 'fox\n----------\nfloof', + '

fox

floof': '# fox\nfloof', + '

fox

floof

': '# fox\nfloof', + 'floof

fox

': 'floof\n# fox', + '

floof

fox

': 'floof\n# fox', + '

fox

': '## fox', + '

fox

': '### fox', + '

fox

': '#### fox', + '
fox
': '##### fox', + '
fox
': '###### fox', + 'fox': 'fox', + '

fox

\n

floof

': 'fox\n\nfloof', + }; + for (final entry in testMap.entries) { + expect(HtmlToText.convert(entry.key), entry.value); + } + }); + }); +}