From 650869e2f7935ce9634f390db918ca07d32b810c Mon Sep 17 00:00:00 2001 From: Johann150 Date: Fri, 9 Aug 2024 00:08:23 +0200 Subject: [PATCH] server: ol/ul for html->mfm Implements a simple rendering for ordered and unordered lists. To properly support nested lists, and lists inside of blockquotes, this necessitated rewriting the `analyze` function to return text. This now also means that markup inside of blockquotes will be parsed properly. --- packages/backend/src/mfm/from-html.ts | 174 ++++++++++++++------------ 1 file changed, 92 insertions(+), 82 deletions(-) diff --git a/packages/backend/src/mfm/from-html.ts b/packages/backend/src/mfm/from-html.ts index b00c86741..f6183c092 100644 --- a/packages/backend/src/mfm/from-html.ts +++ b/packages/backend/src/mfm/from-html.ts @@ -23,13 +23,14 @@ export function fromHtml(html: string, quoteUri?: string | null): string { html.replace(/\r?\n/gi, '\n'), ); - let text = ''; + // stores if we are parsing any lists. + // 0 for a level that is an unordered list, otherwise the counter for the ordered list + let listIndex: number[] = []; - for (const n of dom.childNodes) { - analyze(n); - } - - return text.trim(); + return dom.childNodes + .map(analyze) + .join('') + .trim(); function getText(node: TreeAdapter.Node): string { if (treeAdapter.isTextNode(node)) return node.value; @@ -43,60 +44,55 @@ export function fromHtml(html: string, quoteUri?: string | null): string { return ''; } - function appendChildren(childNodes: TreeAdapter.ChildNode[]): void { - if (childNodes.length > 0) { - for (const n of childNodes) { - analyze(n); - } - } + function analyzeMultiple(childNodes: TreeAdapter.ChildNode[]): string { + return childNodes.map(analyze).join(''); } - function analyze(node: TreeAdapter.Node): void { + function analyze(node: TreeAdapter.Node): string { if (treeAdapter.isTextNode(node)) { - text += node.value; - return; + return node.value; } // Skip comment or document type node - if (!treeAdapter.isElementNode(node)) return; 
+ if (!treeAdapter.isElementNode(node)) return ''; switch (node.nodeName) { case 'br': { - text += '\n'; - break; + return '\n'; } case 'a': { + let text = ''; // trim spaces away, because some AP servers (app.wafrn.net) send strange // zero width non-break space in strange places and things like that - const txt = getText(node).trim(); + const linkText = getText(node).trim(); const href = getAttr(node, 'href'); // hashtags - if (txt.startsWith('#') && href && (attrHas(node, 'rel', 'tag') || attrHas(node, 'class', 'hashtag'))) { - text += txt; + if (linkText.startsWith('#') && href && (attrHas(node, 'rel', 'tag') || attrHas(node, 'class', 'hashtag'))) { + text += linkText; // mentions: a link that starts with `@` and does not include space - } else if (txt.startsWith('@') && txt.match(/\s/) == null && !attrHas(node, 'rel', 'me')) { - const part = txt.split('@'); + } else if (linkText.startsWith('@') && linkText.match(/\s/) == null && !attrHas(node, 'rel', 'me')) { + const part = linkText.split('@'); if (part.length === 2 && href) { // restore the host name part - const acct = `${txt}@${(new URL(href)).hostname}`; + const acct = `${linkText}@${(new URL(href)).hostname}`; text += acct; } else if (part.length === 3) { - text += txt; + text += linkText; } // other } else { const generateLink = () => { - if (!href && !txt) { + if (!href && !linkText) { return ''; } if (!href) { - return txt; + return linkText; } - if (!txt || txt === href) { // #6383: Missing text node + if (!linkText || linkText === href) { // #6383: Missing text node if (href.match(urlRegexFull)) { return href; } else { @@ -104,78 +100,57 @@ export function fromHtml(html: string, quoteUri?: string | null): string { } } if (href.match(urlRegex) && !href.match(urlRegexFull)) { - return `[${txt}](<${href}>)`; // #6846 + return `[${linkText}](<${href}>)`; // #6846 } else { - return `[${txt}](${href})`; + return `[${linkText}](${href})`; } }; text += generateLink(); } - break; + return text; } case 'h1': { 
- text += '【'; - appendChildren(node.childNodes); - text += '】\n'; - break; + return '【' + analyzeMultiple(node.childNodes) + '】\n'; } case 'b': case 'strong': { - text += '**'; - appendChildren(node.childNodes); - text += '**'; - break; + return '**' + analyzeMultiple(node.childNodes) + '**'; } case 'small': { - text += ''; - appendChildren(node.childNodes); - text += ''; - break; + return '' + analyzeMultiple(node.childNodes) + ''; } case 's': case 'del': { - text += '~~'; - appendChildren(node.childNodes); - text += '~~'; - break; + return '~~' + analyzeMultiple(node.childNodes) + '~~'; } case 'i': case 'em': { - text += ''; - appendChildren(node.childNodes); - text += ''; - break; + return '' + analyzeMultiple(node.childNodes) + ''; } // block code (
)
 			case 'pre': {
 				if (node.childNodes.length === 1 && node.childNodes[0].nodeName === 'code') {
-					text += '\n```\n';
-					text += getText(node.childNodes[0]);
-					text += '\n```\n';
+					return '\n```\n' + getText(node.childNodes[0]) + '\n```\n';
 				} else {
-					appendChildren(node.childNodes);
+					return analyzeMultiple(node.childNodes);
 				}
-				break;
 			}
 
 			// inline code (<code>)
 			case 'code': {
-				text += '`';
-				appendChildren(node.childNodes);
-				text += '`';
-				break;
+				return '`' + analyzeMultiple(node.childNodes) + '`';
 			}
 
 			// inline or block KaTeX
@@ -207,31 +182,27 @@ export function fromHtml(html: string, quoteUri?: string | null): string {
 					// can be rendered as KaTeX, now decide if it is possible to render as inline or not
 					if (/[\r\n]/.test(formula)) {
 						// line break, this must be rendered as a block
-						text += '\n\\[' + formula + '\\]\n';
+						return '\n\\[' + formula + '\\]\n';
 					} else {
 						// render as inline
-						text += '\\(' + formula + '\\)';
+						return '\\(' + formula + '\\)';
 					}
 				} else {
 					// not KaTeX, but if there is a plaintext annotation it can still be rendered as code
 					if (/[\r\n]/.test(formula)) {
 						// line break, this must be rendered as a block
-						text += '\n```\n' + formula + '\n```\n';
+						return '\n```\n' + formula + '\n```\n';
 					} else {
 						// render as inline
-						text += '`' + formula + '`';
+						return '`' + formula + '`';
 					}
 				}
-				break;
 			}
 
 			case 'blockquote': {
-				const t = getText(node);
-				if (t) {
-					text += '\n> ';
-					text += t.split('\n').join('\n> ');
-				}
-				break;
+				return analyzeMultiple(node.childNodes)
+					.trim()
+					.replace(/^|\n/g, '\n>');
 			}
 
 			case 'p':
@@ -241,9 +212,50 @@ export function fromHtml(html: string, quoteUri?: string | null): string {
 			case 'h5':
 			case 'h6':
 			{
-				text += '\n\n';
-				appendChildren(node.childNodes);
-				break;
+				return '\n\n' + analyzeMultiple(node.childNodes);
+			}
+
+			// lists and list items
+			case 'ol':
+			case 'ul':
+			{
+				if (node.nodeName == 'ol') {
+					listIndex.push(1);
+				} else {
+					listIndex.push(0);
+				}
+				let text = '\n' + analyzeMultiple(node.childNodes);
+				listIndex.pop();
+				return text;
+			}
+
+			case 'li':
+			{
+				if (listIndex.length == 0) {
+					break;
+				}
+				let text = '\n';
+
+				// pop the index of the list currently being processed so it can be updated
+				let index = listIndex.pop();
+				// indent the start of the list item according to the nesting
+				// level of the enclosing lists
+				//
+				// since we popped the current index, the length will be 0 on
+				// the first level, thus causing no indent on the first level
+				text += '  '.repeat(listIndex.length);
+				if (index == 0) {
+					text += '- ';
+				} else {
+					text += index + ') ';
+					index++;
+				}
+				// done with the index, put it back so nested lists with
+				// analyzeMultiple will work correctly
+				listIndex.push(index);
+
+				text += analyzeMultiple(node.childNodes);
+				return text;
 			}
 
 			// other block elements
@@ -251,30 +263,28 @@ export function fromHtml(html: string, quoteUri?: string | null): string {
 			case 'header':
 			case 'footer':
 			case 'article':
-			case 'li':
 			case 'dt':
 			case 'dd':
 			{
-				text += '\n';
-				appendChildren(node.childNodes);
-				break;
+				return '\n' + analyzeMultiple(node.childNodes);
 			}
 
 			case 'span':
 			{
 				if (attrHas(node, 'class', 'quote-inline') && quoteUri && getText(node).trim() === `RE: ${quoteUri}`) {
 					// embedded quote thingy for backwards compatibility, don't show it
+					return '';
 				} else {
-					appendChildren(node.childNodes);
+					return analyzeMultiple(node.childNodes);
 				}
-				break;
 			}
 
 			default:	// includes inline elements
 			{
-				appendChildren(node.childNodes);
-				break;
+				return analyzeMultiple(node.childNodes);
 			}
 		}
+
+		return '';
 	}
 }