server: ol/ul for html->mfm

Implements a simple rendering for ordered and unordered lists. Properly supporting nested lists, and lists inside of blockquotes, necessitated rewriting the `analyze` function to return text. This also means that markup inside of blockquotes will now be parsed properly.
2024-08-09 00:08:23 +02:00 · 2024-08-09 00:08:23 +02:00 · 650869e2f7
commit 650869e2f7
parent 2ed5ecd6a1
1 changed files with 92 additions and 82 deletions
--- a/packages/backend/src/mfm/from-html.ts
+++ b/packages/backend/src/mfm/from-html.ts
@ -23,13 +23,14 @@ export function fromHtml(html: string, quoteUri?: string | null): string {
 		html.replace(/<br\s?\/?>\r?\n/gi, '\n'),
 	);

-	let text = '';
+	// Stack of the lists currently being parsed, one entry per nesting level.
+	// An entry is 0 for an unordered list, otherwise the next item number of the ordered list.
+	let listIndex: number[] = [];

-	for (const n of dom.childNodes) {
-		analyze(n);
-	}
-
-	return text.trim();
+	return dom.childNodes
+		.map(analyze)
+		.join('')
+		.trim();

 	function getText(node: TreeAdapter.Node): string {
 		if (treeAdapter.isTextNode(node)) return node.value;
@ -43,60 +44,55 @@ export function fromHtml(html: string, quoteUri?: string | null): string {
 		return '';
 	}

-	function appendChildren(childNodes: TreeAdapter.ChildNode[]): void {
-		if (childNodes.length > 0) {
-			for (const n of childNodes) {
-				analyze(n);
-			}
-		}
+	function analyzeMultiple(childNodes: TreeAdapter.ChildNode[]): string {
+		return childNodes.map(analyze).join('');
 	}

-	function analyze(node: TreeAdapter.Node): void {
+	function analyze(node: TreeAdapter.Node): string {
 		if (treeAdapter.isTextNode(node)) {
-			text += node.value;
-			return;
+			return node.value;
 		}

 		// Skip comment or document type node
-		if (!treeAdapter.isElementNode(node)) return;
+		if (!treeAdapter.isElementNode(node)) return '';

 		switch (node.nodeName) {
 			case 'br': {
-				text += '\n';
-				break;
+				return '\n';
 			}

 			case 'a':
 			{
+				let text = '';
 				// trim spaces away, because some AP servers (app.wafrn.net) send strange
 				// zero width non-break space in strange places and things like that
-				const txt = getText(node).trim();
+				const linkText = getText(node).trim();
 				const href = getAttr(node, 'href');

 				// hashtags
-				if (txt.startsWith('#') && href && (attrHas(node, 'rel', 'tag') || attrHas(node, 'class', 'hashtag'))) {
-					text += txt;
+				if (linkText.startsWith('#') && href && (attrHas(node, 'rel', 'tag') || attrHas(node, 'class', 'hashtag'))) {
+					text += linkText;
 				// mentions: a link that starts with `@` and does not include space
-				} else if (txt.startsWith('@') && txt.match(/\s/) == null && !attrHas(node, 'rel', 'me')) {
-					const part = txt.split('@');
+				} else if (linkText.startsWith('@') && linkText.match(/\s/) == null && !attrHas(node, 'rel', 'me')) {
+					const part = linkText.split('@');

 					if (part.length === 2 && href) {
 						// restore the host name part
-						const acct = `${txt}@${(new URL(href)).hostname}`;
+						const acct = `${linkText}@${(new URL(href)).hostname}`;
 						text += acct;
 					} else if (part.length === 3) {
-						text += txt;
+						text += linkText;
 					}
 				// other
 				} else {
 					const generateLink = () => {
-						if (!href && !txt) {
+						if (!href && !linkText) {
 							return '';
 						}
 						if (!href) {
-							return txt;
+							return linkText;
 						}
-						if (!txt || txt === href) { // #6383: Missing text node
+						if (!linkText || linkText === href) { // #6383: Missing text node
 							if (href.match(urlRegexFull)) {
 								return href;
 							} else {
@ -104,78 +100,57 @@ export function fromHtml(html: string, quoteUri?: string | null): string {
 							}
 						}
 						if (href.match(urlRegex) && !href.match(urlRegexFull)) {
-							return `[${txt}](<${href}>)`; // #6846
+							return `[${linkText}](<${href}>)`; // #6846
 						} else {
-							return `[${txt}](${href})`;
+							return `[${linkText}](${href})`;
 						}
 					};

 					text += generateLink();
 				}
-				break;
+				return text;
 			}

 			case 'h1':
 			{
-				text += '【';
-				appendChildren(node.childNodes);
-				text += '】\n';
-				break;
+				return '【' + analyzeMultiple(node.childNodes) + '】\n';
 			}

 			case 'b':
 			case 'strong':
 			{
-				text += '**';
-				appendChildren(node.childNodes);
-				text += '**';
-				break;
+				return '**' + analyzeMultiple(node.childNodes) + '**';
 			}

 			case 'small':
 			{
-				text += '<small>';
-				appendChildren(node.childNodes);
-				text += '</small>';
-				break;
+				return '<small>' + analyzeMultiple(node.childNodes) + '</small>';
 			}

 			case 's':
 			case 'del':
 			{
-				text += '~~';
-				appendChildren(node.childNodes);
-				text += '~~';
-				break;
+				return '~~' + analyzeMultiple(node.childNodes) + '~~';
 			}

 			case 'i':
 			case 'em':
 			{
-				text += '<i>';
-				appendChildren(node.childNodes);
-				text += '</i>';
-				break;
+				return '<i>' + analyzeMultiple(node.childNodes) + '</i>';
 			}

 			// block code (<pre><code>)
 			case 'pre': {
 				if (node.childNodes.length === 1 && node.childNodes[0].nodeName === 'code') {
-					text += '\n```\n';
-					text += getText(node.childNodes[0]);
-					text += '\n```\n';
+					return '\n```\n' + getText(node.childNodes[0]) + '\n```\n';
 				} else {
-					appendChildren(node.childNodes);
+					return analyzeMultiple(node.childNodes);
 				}
-				break;
 			}

 			// inline code (<code>)
 			case 'code': {
-				text += '`';
-				appendChildren(node.childNodes);
-				text += '`';
-				break;
+				return '`' + analyzeMultiple(node.childNodes) + '`';
 			}

 			// inline or block KaTeX
@ -207,31 +182,27 @@ export function fromHtml(html: string, quoteUri?: string | null): string {
 					// can be rendered as KaTeX, now decide if it is possible to render as inline or not
 					if (/[\r\n]/.test(formula)) {
 						// line break, this must be rendered as a block
-						text += '\n\\[' + formula + '\\]\n';
+						return '\n\\[' + formula + '\\]\n';
 					} else {
 						// render as inline
-						text += '\\(' + formula + '\\)';
+						return '\\(' + formula + '\\)';
 					}
 				} else {
 					// not KaTeX, but if there is a plaintext annotation it can still be rendered as code
 					if (/[\r\n]/.test(formula)) {
 						// line break, this must be rendered as a block
-						text += '\n```\n' + formula + '\n```\n';
+						return '\n```\n' + formula + '\n```\n';
 					} else {
 						// render as inline
-						text += '`' + formula + '`';
+						return '`' + formula + '`';
 					}
 				}
-				break;
 			}

 			case 'blockquote': {
-				const t = getText(node);
-				if (t) {
-					text += '\n> ';
-					text += t.split('\n').join('\n> ');
-				}
-				break;
+				return analyzeMultiple(node.childNodes)
+					.trim()
+					.replace(/^|\n/g, '\n>');
 			}

 			case 'p':
@ -241,9 +212,50 @@ export function fromHtml(html: string, quoteUri?: string | null): string {
 			case 'h5':
 			case 'h6':
 			{
-				text += '\n\n';
-				appendChildren(node.childNodes);
-				break;
+				return '\n\n' + analyzeMultiple(node.childNodes);
+			}
+
+			// lists and list items
+			case 'ol':
+			case 'ul':
+			{
+				if (node.nodeName == 'ol') {
+					listIndex.push(1);
+				} else {
+					listIndex.push(0);
+				}
+				let text = '\n' + analyzeMultiple(node.childNodes);
+				listIndex.pop();
+				return text;
+			}
+
+			case 'li':
+			{
+				if (listIndex.length == 0) {
+					break;
+				}
+				let text = '\n';
+
+				// pop the index of the innermost list so it can be updated
+				let index = listIndex.pop();
+				// indent the start of the list item according to the nesting
+				// level of the lists
+				//
+				// since we popped the current index, the length will be 0 on
+				// the first level, thus causing no indent on the first level
+				text += '  '.repeat(listIndex.length);
+				if (index == 0) {
+					text += '- ';
+				} else {
+					text += index + ') ';
+					index++;
+				}
+				// done with the index, put it back so nested lists with
+				// analyzeMultiple will work correctly
+				listIndex.push(index);
+
+				text += analyzeMultiple(node.childNodes);
+				return text;
 			}

 			// other block elements
@ -251,30 +263,28 @@ export function fromHtml(html: string, quoteUri?: string | null): string {
 			case 'header':
 			case 'footer':
 			case 'article':
-			case 'li':
 			case 'dt':
 			case 'dd':
 			{
-				text += '\n';
-				appendChildren(node.childNodes);
-				break;
+				return '\n' + analyzeMultiple(node.childNodes);
 			}

 			case 'span':
 			{
 				if (attrHas(node, 'class', 'quote-inline') && quoteUri && getText(node).trim() === `RE: ${quoteUri}`) {
 					// embedded quote thingy for backwards compatibility, don't show it
+					return '';
 				} else {
-					appendChildren(node.childNodes);
+					return analyzeMultiple(node.childNodes);
 				}
-				break;
 			}

 			default:	// includes inline elements
 			{
-				appendChildren(node.childNodes);
-				break;
+				return analyzeMultiple(node.childNodes);
 			}
 		}
+
+		return '';
 	}
 }