259 lines
		
	
	
		
			8.0 KiB
		
	
	
	
		
			Dart
		
	
	
	
			
		
		
	
	
			259 lines
		
	
	
		
			8.0 KiB
		
	
	
	
		
			Dart
		
	
	
	
| /*
 | |
|  *   Famedly Matrix SDK
 | |
|  *   Copyright (C) 2021 Famedly GmbH
 | |
|  *
 | |
|  *   This program is free software: you can redistribute it and/or modify
 | |
|  *   it under the terms of the GNU Affero General Public License as
 | |
|  *   published by the Free Software Foundation, either version 3 of the
 | |
|  *   License, or (at your option) any later version.
 | |
|  *
 | |
|  *   This program is distributed in the hope that it will be useful,
 | |
|  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 | |
|  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 | |
|  *   GNU Affero General Public License for more details.
 | |
|  *
 | |
|  *   You should have received a copy of the GNU Affero General Public License
 | |
|  *   along with this program.  If not, see <https://www.gnu.org/licenses/>.
 | |
|  */
 | |
| 
 | |
| import 'package:collection/collection.dart';
 | |
| import 'package:html/dom.dart';
 | |
| import 'package:html/parser.dart';
 | |
| import 'package:html_unescape/html_unescape.dart';
 | |
| 
 | |
/// Converts Matrix `formatted_body` HTML into a pseudo-markdown plain text
/// representation.
///
/// All members are static; the only entry point is [convert].
class HtmlToText {
  /// Matches everything from the first `<mx-reply>` up to the last
  /// `</mx-reply>` (greedy, case-insensitive, `.` also matches newlines).
  static final _mxReplyPattern = RegExp(r'<mx-reply>.*</mx-reply>',
      caseSensitive: false, multiLine: false, dotAll: true);

  /// Matches trailing whitespace at the very end of the input.
  static final _trailingWhitespacePattern = RegExp(r'\s*$', multiLine: false);

  /// Matches a `<code ...>` opening tag at the start of the input and captures
  /// its raw attribute text in group 1.
  static final _codeOpenTagPattern =
      RegExp(r'^<code([^>]*)>', multiLine: false, caseSensitive: false);

  /// Matches a `</code>` closing tag at the very end of the input.
  static final _codeCloseTagPattern =
      RegExp(r'</code>$', multiLine: false, caseSensitive: false);

  /// Captures the language name out of a `language-xyz` token.
  static final _languageClassPattern =
      RegExp(r'language-(\w+)', multiLine: false, caseSensitive: false);

  /// Matches a string made up of ASCII digits only.
  static final _digitsOnlyPattern = RegExp(r'^[0-9]+$', multiLine: false);

  /// Convert an HTML string to a pseudo-markdown plain text representation, with
  /// `data-mx-spoiler` spans redacted
  static String convert(String html) {
    // riot-web is notorious for creating bad reply fallback events from invalid
    // messages which, if not handled properly, can lead to impersonation. As
    // such, we strip the entire `<mx-reply>` tags here already, to prevent that
    // from happening.
    // We do *not* do this in an AST and just with simple regex here, as
    // riot-web tends to create mismatching tags, and this way we actually
    // correctly identify what we want to strip and, well, strip it.
    final renderHtml = html.replaceAll(_mxReplyPattern, '');

    final opts = _ConvertOpts();
    final reply = _walkNode(opts, parseFragment(renderHtml));
    // Drop any whitespace left at the end of the rendered text.
    return reply.replaceAll(_trailingWhitespacePattern, '');
  }

  /// Renders the inner content of a `<pre>` element.
  ///
  /// If the content starts with a `<code>` tag, that tag (and a `</code>` at
  /// the very end) is removed and a `language-xyz` token found in the tag's
  /// attributes is prepended as the code fence language. The HTML-unescaped
  /// text is padded so that it starts and ends with a newline, unless it is
  /// empty.
  static String _parsePreContent(_ConvertOpts opts, Element node) {
    final inner = node.innerHtml;
    final match = _codeOpenTagPattern.firstMatch(inner);
    if (match == null) {
      // No leading <code> tag: treat the whole content as the code text.
      return _wrapInNewlines(HtmlUnescape().convert(inner));
    }
    // Remove the <code> opening tag and the </code> closing tag.
    final code =
        inner.substring(match.end).replaceAll(_codeCloseTagPattern, '');
    final text = _wrapInNewlines(HtmlUnescape().convert(code));
    final language = _languageClassPattern.firstMatch(match.group(1)!);
    return language == null ? text : '${language.group(1)!}$text';
  }

  /// Returns [text] with a leading and a trailing `\n` added where missing.
  ///
  /// An empty string is returned unchanged.
  static String _wrapInNewlines(String text) {
    if (text.isEmpty) {
      return text;
    }
    final head = text.startsWith('\n') ? '' : '\n';
    final tail = text.endsWith('\n') ? '' : '\n';
    return '$head$text$tail';
  }

  /// Renders a `<blockquote>` by prefixing every line of its rendered children
  /// with `> ` and appending a final newline.
  static String _parseBlockquoteContent(_ConvertOpts opts, Element node) {
    final msg = _walkChildNodes(opts, node);
    final quoted = msg.split('\n').map((line) => '> $line').join('\n');
    return '$quoted\n';
  }

  /// Renders a `<span>`.
  ///
  /// A span carrying a `data-mx-spoiler` attribute has its rendered content
  /// replaced by one `█` per UTF-16 code unit, preceded by `(reason) ` when the
  /// attribute value is non-empty. Any other span renders as its children.
  static String _parseSpanContent(_ConvertOpts opts, Element node) {
    final content = _walkChildNodes(opts, node);
    final reason = node.attributes['data-mx-spoiler'];
    if (reason == null) {
      return content;
    }
    final redacted = '█' * content.length;
    return reason == '' ? redacted : '($reason) $redacted';
  }

  /// Renders a `<ul>`: one line per `<li>` child, prefixed with a bullet chosen
  /// from [_listBulletPoints] by the current nesting depth and indented by four
  /// spaces per level.
  static String _parseUlContent(_ConvertOpts opts, Element node) {
    // The children are rendered one level deeper than the list itself.
    opts.listDepth++;
    final entries = _listChildNodes(opts, node, {'li'});
    opts.listDepth--;

    final bulletPoint =
        _listBulletPoints[opts.listDepth % _listBulletPoints.length];
    final indent = '    ' * opts.listDepth;

    return entries
        .map((entry) =>
            '$indent$bulletPoint ${entry.replaceAll('\n', '\n$indent  ')}')
        .join('\n');
  }

  /// Renders an `<ol>`: one line per `<li>` child, numbered upwards from the
  /// `start` attribute and indented by four spaces per nesting level.
  ///
  /// Numbering starts at 1 when `start` is missing, is not made up purely of
  /// ASCII digits, or cannot be represented as an [int].
  static String _parseOlContent(_ConvertOpts opts, Element node) {
    // The children are rendered one level deeper than the list itself.
    opts.listDepth++;
    final entries = _listChildNodes(opts, node, {'li'});
    opts.listDepth--;

    final startStr = node.attributes['start'];
    // `int.tryParse` rather than `int.parse`: the attribute comes from
    // untrusted HTML and an over-long digit string must not throw.
    final start = (startStr != null && _digitsOnlyPattern.hasMatch(startStr))
        ? int.tryParse(startStr) ?? 1
        : 1;
    final indent = '    ' * opts.listDepth;

    return entries
        .mapIndexed((index, entry) =>
            '$indent${start + index}. ${entry.replaceAll('\n', '\n$indent  ')}')
        .join('\n');
  }

  /// Bullet characters for unordered lists, cycled through by nesting depth.
  static const _listBulletPoints = <String>['●', '○', '■', '‣'];

  /// Renders each child of [node] separately and returns the results in order.
  ///
  /// When [types] is given and non-empty, text nodes and elements whose
  /// lower-cased tag name is not contained in [types] are skipped.
  static List<String> _listChildNodes(_ConvertOpts opts, Element node,
      [Iterable<String>? types]) {
    final allowed = types ?? const <String>[];
    final filtering = allowed.isNotEmpty;
    final replies = <String>[];
    for (final child in node.nodes) {
      if (filtering) {
        if (child is Text) {
          continue;
        }
        if (child is Element &&
            !allowed.contains(child.localName!.toLowerCase())) {
          continue;
        }
      }
      replies.add(_walkNode(opts, child));
    }
    return replies;
  }

  /// Tags that are always rendered starting on a fresh line.
  static const _blockTags = <String>{
    'blockquote',
    'ul',
    'ol',
    'h1',
    'h2',
    'h3',
    'h4',
    'h5',
    'h6',
    'pre',
  };

  /// Renders all children of [node] and concatenates the results.
  ///
  /// A blank line is inserted between a `<p>` and the previous element when
  /// that element was also a `<p>`, and a single newline is inserted before a
  /// block-level tag (see [_blockTags]) if the text so far does not already end
  /// with one.
  static String _walkChildNodes(_ConvertOpts opts, Node node) {
    var reply = '';
    // Tag name of the most recent element child; text nodes do not reset it.
    var lastTag = '';
    for (final child in node.nodes) {
      final thisTag = child is Element ? child.localName!.toLowerCase() : '';
      if (thisTag == 'p' && lastTag == 'p') {
        reply += '\n\n';
      } else if (_blockTags.contains(thisTag) &&
          reply.isNotEmpty &&
          !reply.endsWith('\n')) {
        reply += '\n';
      }
      reply += _walkNode(opts, child);
      if (thisTag.isNotEmpty) {
        lastTag = thisTag;
      }
    }
    return reply;
  }

  /// Renders a single DOM [node] as pseudo-markdown.
  ///
  /// Text nodes are returned verbatim, except that a text node consisting of
  /// exactly one `\n` is dropped. Known elements are mapped to their markdown
  /// equivalent; unknown elements and other node kinds render as the
  /// concatenation of their children.
  static String _walkNode(_ConvertOpts opts, Node node) {
    if (node is Text) {
      // ignore \n between single nodes
      return node.text == '\n' ? '' : node.text;
    }
    if (node is! Element) {
      return _walkChildNodes(opts, node);
    }

    final tag = node.localName!.toLowerCase();
    switch (tag) {
      case 'em':
      case 'i':
        return '*${_walkChildNodes(opts, node)}*';
      case 'strong':
      case 'b':
        return '**${_walkChildNodes(opts, node)}**';
      case 'u':
      case 'ins':
        return '__${_walkChildNodes(opts, node)}__';
      case 'del':
      case 'strike':
      case 's':
        return '~~${_walkChildNodes(opts, node)}~~';
      case 'code':
        return '`${node.text}`';
      case 'pre':
        return '```${_parsePreContent(opts, node)}```\n';
      case 'a':
        final href = (node.attributes['href'] ?? '').toLowerCase();
        final content = _walkChildNodes(opts, node);
        // Links with a matrix.to or matrix: target render as their bare
        // content; every other link is marked with a link symbol.
        if (href.startsWith('https://matrix.to/#/') ||
            href.startsWith('matrix:')) {
          return content;
        }
        return '🔗$content';
      case 'img':
        return node.attributes['alt'] ??
            node.attributes['title'] ??
            node.attributes['src'] ??
            '';
      case 'br':
        return '\n';
      case 'blockquote':
        return _parseBlockquoteContent(opts, node);
      case 'ul':
        return _parseUlContent(opts, node);
      case 'ol':
        return _parseOlContent(opts, node);
      case 'mx-reply':
        return '';
      case 'hr':
        return '\n----------\n';
      case 'h1':
      case 'h2':
      case 'h3':
      case 'h4':
      case 'h5':
      case 'h6':
        // The digit in the tag name is the heading level.
        final mark = '#' * int.parse(tag[1]);
        return '$mark ${_walkChildNodes(opts, node)}\n';
      case 'span':
        return _parseSpanContent(opts, node);
      default:
        return _walkChildNodes(opts, node);
    }
  }
}
 | |
| 
 | |
/// Mutable state shared by the helpers of one [HtmlToText.convert] call.
class _ConvertOpts {
  /// Current list nesting level.
  ///
  /// Starts at zero; the list renderers increment it while rendering their
  /// children and decrement it again afterwards.
  var listDepth = 0;
}
 |