matrix-dart-sdk/lib/src/utils/html_to_text.dart

/*
 *   Famedly Matrix SDK
 *   Copyright (C) 2021 Famedly GmbH
 *
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU Affero General Public License as
 *   published by the Free Software Foundation, either version 3 of the
 *   License, or (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 *   GNU Affero General Public License for more details.
 *
 *   You should have received a copy of the GNU Affero General Public License
 *   along with this program.  If not, see <https://www.gnu.org/licenses/>.
 */

import 'package:collection/collection.dart';
import 'package:html/dom.dart';
import 'package:html/parser.dart';
import 'package:html_unescape/html_unescape.dart';

/// Converts Matrix-style HTML message bodies into a pseudo-markdown plain text
/// representation.
class HtmlToText {
  /// Matches everything from the first `<mx-reply>` to the last `</mx-reply>`.
  ///
  /// Deliberately greedy and spanning newlines, see [convert].
  static final _mxReplyPattern =
      RegExp(r'<mx-reply>.*</mx-reply>', caseSensitive: false, dotAll: true);

  /// Matches the whitespace at the very end of the input.
  static final _trailingWhitespacePattern = RegExp(r'\s*$');

  /// Matches a `<code ...>` opening tag at the very start of the input and
  /// captures its raw attribute text in group 1.
  static final _codeOpenTagPattern =
      RegExp(r'^<code([^>]*)>', caseSensitive: false);

  /// Matches a `</code>` closing tag at the very end of the input.
  static final _codeCloseTagPattern = RegExp(r'</code>$', caseSensitive: false);

  /// Captures the language name out of a `language-xyz` token.
  static final _languagePattern =
      RegExp(r'language-(\w+)', caseSensitive: false);

  /// Matches strings that consist solely of ASCII digits.
  static final _digitsPattern = RegExp(r'^[0-9]+$');

  /// Convert an HTML string to a pseudo-markdown plain text representation, with
  /// `data-mx-spoiler` spans redacted
  ///
  /// Trailing whitespace is removed from the result.
  static String convert(String html) {
    // riot-web is notorious for creating bad reply fallback events from invalid
    // messages which, if not handled properly, can lead to impersonation. As
    // such, we strip the entire `<mx-reply>` tags here already, to prevent that
    // from happening.
    // We do *not* do this in an AST and just with simple regex here, as
    // riot-web tends to create mismatched tags, and this way we actually
    // correctly identify what we want to strip and, well, strip it.
    final renderHtml = html.replaceAll(_mxReplyPattern, '');

    final text = _walkNode(_ConvertOpts(), parseFragment(renderHtml));
    return text.replaceAll(_trailingWhitespacePattern, '');
  }

  /// Returns the lower-cased tag name of [node], or an empty string when
  /// [node] is not an element (or has no local name).
  static String _tagName(Node node) =>
      node is Element ? (node.localName?.toLowerCase() ?? '') : '';

  /// Makes sure a non-empty [text] starts and ends with a newline.
  ///
  /// An empty [text] is returned unchanged.
  static String _padWithNewlines(String text) {
    if (text.isEmpty) {
      return text;
    }
    final head = text.startsWith('\n') ? '' : '\n';
    final tail = text.endsWith('\n') ? '' : '\n';
    return '$head$text$tail';
  }

  /// Renders the inside of a `<pre>` element (without the surrounding fences).
  ///
  /// The inner HTML is unescaped and padded with newlines. If it starts with a
  /// `<code>` tag, that tag (and a `</code>` at the very end) is removed, and a
  /// `language-xyz` token found in the tag's attributes is put in front of the
  /// text so that it ends up right behind the opening fence.
  static String _parsePreContent(_ConvertOpts opts, Element node) {
    final inner = node.innerHtml;
    final openTag = _codeOpenTagPattern.firstMatch(inner);
    if (openTag == null) {
      return _padWithNewlines(HtmlUnescape().convert(inner));
    }

    // Drop the <code> opening tag and, if present at the end, the closing tag.
    final code =
        inner.substring(openTag.end).replaceAll(_codeCloseTagPattern, '');
    final text = _padWithNewlines(HtmlUnescape().convert(code));

    // Group 1 always participates in the match, the fallback only satisfies
    // the type system.
    final language =
        _languagePattern.firstMatch(openTag.group(1) ?? '')?.group(1);
    return language == null ? text : '$language$text';
  }

  /// Renders a `<blockquote>`: every line of the rendered children is prefixed
  /// with `> `, and a newline is appended.
  static String _parseBlockquoteContent(_ConvertOpts opts, Element node) {
    final quoted = _walkChildNodes(opts, node)
        .split('\n')
        .map((line) => '> $line')
        .join('\n');
    return '$quoted\n';
  }

  /// Renders a `<span>`.
  ///
  /// Spans carrying a `data-mx-spoiler` attribute are redacted: every UTF-16
  /// code unit of the rendered content is replaced by `█`, prefixed with the
  /// spoiler reason in parentheses if the attribute value is not empty. Other
  /// spans simply render their children.
  static String _parseSpanContent(_ConvertOpts opts, Element node) {
    final content = _walkChildNodes(opts, node);
    final reason = node.attributes['data-mx-spoiler'];
    if (reason == null) {
      return content;
    }
    final redacted = '█' * content.length;
    return reason.isEmpty ? redacted : '($reason) $redacted';
  }

  /// Formats one list [entry] at nesting [depth].
  ///
  /// The first line is indented by four spaces per level and prefixed with
  /// [marker] and a space; continuation lines get the same indentation plus
  /// two extra spaces.
  static String _formatListEntry(int depth, String marker, String entry) {
    final indent = '    ' * depth;
    final continuation = '\n$indent  ';
    return '$indent$marker ${entry.replaceAll('\n', continuation)}';
  }

  /// Renders a `<ul>`; the bullet symbol is picked by the current list depth.
  static String _parseUlContent(_ConvertOpts opts, Element node) {
    // Children are rendered one level deeper than the list itself.
    opts.listDepth++;
    final entries = _listChildNodes(opts, node, {'li'});
    opts.listDepth--;

    final depth = opts.listDepth;
    final bulletPoint = _listBulletPoints[depth % _listBulletPoints.length];
    return entries
        .map((entry) => _formatListEntry(depth, bulletPoint, entry))
        .join('\n');
  }

  /// Renders an `<ol>`, numbering the entries from its `start` attribute.
  ///
  /// Numbering starts at 1 when `start` is missing, is not a plain sequence of
  /// digits, or does not fit into an [int].
  static String _parseOlContent(_ConvertOpts opts, Element node) {
    // Children are rendered one level deeper than the list itself.
    opts.listDepth++;
    final entries = _listChildNodes(opts, node, {'li'});
    opts.listDepth--;

    final depth = opts.listDepth;
    final startStr = node.attributes['start'];
    // `int.parse` throws on digit strings that overflow, and the attribute is
    // untrusted input, so fall back to 1 instead of failing the conversion.
    final start = (startStr != null && _digitsPattern.hasMatch(startStr))
        ? (int.tryParse(startStr) ?? 1)
        : 1;

    return entries
        .mapIndexed(
            (index, entry) => _formatListEntry(depth, '${start + index}.', entry))
        .join('\n');
  }

  static const _listBulletPoints = <String>['●', '○', '■', '‣'];

  /// Renders the direct children of [node] one by one.
  ///
  /// If [types] is given and not empty, only element children whose lower-cased
  /// tag name is contained in [types] are rendered; text, comments and other
  /// elements are skipped. Otherwise every child is rendered.
  static List<String> _listChildNodes(_ConvertOpts opts, Element node,
      [Iterable<String>? types]) {
    final wanted = (types == null || types.isEmpty) ? null : types;
    return [
      for (final child in node.nodes)
        if (wanted == null ||
            (child is Element && wanted.contains(_tagName(child))))
          _walkNode(opts, child),
    ];
  }

  static const _blockTags = <String>{
    'blockquote',
    'ul',
    'ol',
    'h1',
    'h2',
    'h3',
    'h4',
    'h5',
    'h6',
    'pre',
  };

  /// Renders all children of [node] and concatenates the results.
  ///
  /// A blank line is put between a `<p>` and the element preceding it if that
  /// is a `<p>` too, and block-level elements (see [_blockTags]) are forced
  /// onto a new line.
  static String _walkChildNodes(_ConvertOpts opts, Node node) {
    final buffer = StringBuffer();
    // Whether the text written so far ends with '\n'; only meaningful once the
    // buffer is not empty.
    var endsWithNewline = false;
    void append(String piece) {
      if (piece.isEmpty) {
        return;
      }
      buffer.write(piece);
      endsWithNewline = piece.endsWith('\n');
    }

    var lastTag = '';
    for (final child in node.nodes) {
      final thisTag = _tagName(child);
      if (thisTag == 'p' && lastTag == 'p') {
        append('\n\n');
      } else if (_blockTags.contains(thisTag) &&
          buffer.isNotEmpty &&
          !endsWithNewline) {
        append('\n');
      }
      append(_walkNode(opts, child));
      // Non-element children (e.g. text) do not reset the last seen tag.
      if (thisTag.isNotEmpty) {
        lastTag = thisTag;
      }
    }
    return buffer.toString();
  }

  /// Renders a single [node] (and, recursively, its children) to text.
  ///
  /// Text nodes yield their text, elements are rendered according to their tag
  /// name, and any other kind of node only contributes its children.
  static String _walkNode(_ConvertOpts opts, Node node) {
    if (node is Text) {
      // ignore \n between single nodes
      return node.text == '\n' ? '' : node.text;
    }
    if (node is! Element) {
      return _walkChildNodes(opts, node);
    }

    final tag = _tagName(node);
    switch (tag) {
      case 'em':
      case 'i':
        return '*${_walkChildNodes(opts, node)}*';
      case 'strong':
      case 'b':
        return '**${_walkChildNodes(opts, node)}**';
      case 'u':
      case 'ins':
        return '__${_walkChildNodes(opts, node)}__';
      case 'del':
      case 'strike':
      case 's':
        return '~~${_walkChildNodes(opts, node)}~~';
      case 'code':
        return '`${node.text}`';
      case 'pre':
        return '```${_parsePreContent(opts, node)}```\n';
      case 'a':
        final href = (node.attributes['href'] ?? '').toLowerCase();
        final content = _walkChildNodes(opts, node);
        // Links to matrix.to and `matrix:` URIs are rendered as their plain
        // content; every other link gets a link marker.
        if (href.startsWith('https://matrix.to/#/') ||
            href.startsWith('matrix:')) {
          return content;
        }
        return '🔗$content';
      case 'img':
        return node.attributes['alt'] ??
            node.attributes['title'] ??
            node.attributes['src'] ??
            '';
      case 'br':
        return '\n';
      case 'blockquote':
        return _parseBlockquoteContent(opts, node);
      case 'ul':
        return _parseUlContent(opts, node);
      case 'ol':
        return _parseOlContent(opts, node);
      case 'mx-reply':
        return '';
      case 'hr':
        return '\n----------\n';
      case 'h1':
      case 'h2':
      case 'h3':
      case 'h4':
      case 'h5':
      case 'h6':
        // The digit in the tag name is the heading level.
        final mark = '#' * int.parse(tag[1]);
        return '$mark ${_walkChildNodes(opts, node)}\n';
      case 'span':
        return _parseSpanContent(opts, node);
      default:
        return _walkChildNodes(opts, node);
    }
  }
}

/// Mutable state that is passed along while a single HTML fragment is being
/// converted.
class _ConvertOpts {
  /// How many lists are currently open around the node being rendered.
  ///
  /// Starts at zero; the list renderers raise it while they render their
  /// entries and lower it again afterwards.
  var listDepth = 0;
}