From bebafa1a2c38972245d37de70f4aec4bfb2083fd Mon Sep 17 00:00:00 2001 From: Henry Jameson Date: Sun, 13 Jun 2021 13:29:26 +0300 Subject: [PATCH] refactored line converter, untied its logic from greentexting, better handling of broken cases --- src/components/rich_content/rich_content.jsx | 10 +++- .../html_line_converter.service.js | 55 +++++++++++++++---- .../html_line_converter.spec.js | 17 ++++-- 3 files changed, 67 insertions(+), 15 deletions(-) diff --git a/src/components/rich_content/rich_content.jsx b/src/components/rich_content/rich_content.jsx index e188763f..328e9201 100644 --- a/src/components/rich_content/rich_content.jsx +++ b/src/components/rich_content/rich_content.jsx @@ -246,6 +246,7 @@ const getLinkData = (attrs, children, index) => { */ export const preProcessPerLine = (html, greentext, handleLinks) => { const lastMentions = [] + const greentextHandle = new Set(['p', 'div']) let nonEmptyIndex = -1 const newHtml = convertHtmlToLines(html).reverse().map((item, index, array) => { @@ -256,7 +257,14 @@ export const preProcessPerLine = (html, greentext, handleLinks) => { nonEmptyIndex += 1 // Greentext stuff - if (greentext && (string.includes('>') || string.includes('<'))) { + if ( + // Only if greentext is engaged + greentext && + // Only handle p's and divs. 
Don't want to affect blockquotes, code etc + item.level.every(l => greentextHandle.has(l)) && + // Only if line begins with '>' or '<' + (string.includes('>') || string.includes('<')) + ) { const cleanedString = string.replace(/<[^>]+?>/gi, '') // remove all tags .replace(/@\w+/gi, '') // remove mentions (even failed ones) .trim() diff --git a/src/services/html_converter/html_line_converter.service.js b/src/services/html_converter/html_line_converter.service.js index e448d5cd..f43d162a 100644 --- a/src/services/html_converter/html_line_converter.service.js +++ b/src/services/html_converter/html_line_converter.service.js @@ -19,9 +19,42 @@ import { getTagName } from './utility.service.js' * @return {(string|{ text: string })[]} processed html in form of a list. */ export const convertHtmlToLines = (html) => { - const ignoredTags = new Set(['code', 'blockquote']) - const handledTags = new Set(['p', 'br', 'div', 'pre', 'code', 'blockquote']) - const openCloseTags = new Set(['p', 'div', 'pre', 'code', 'blockquote']) + // Elements that are implicitly self-closing + // https://developer.mozilla.org/en-US/docs/Glossary/empty_element + const emptyElements = new Set([ + 'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', + 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr' + ]) + // Block-level elements (they make a visual line) + // https://developer.mozilla.org/en-US/docs/Web/HTML/Block-level_elements + const blockElements = new Set([ + 'address', 'article', 'aside', 'blockquote', 'details', 'dialog', 'dd', + 'div', 'dl', 'dt', 'fieldset', 'figcaption', 'figure', 'footer', 'form', + 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'hgroup', 'hr', 'li', 'main', + 'nav', 'ol', 'p', 'pre', 'section', 'table', 'ul' + ]) + // br is very weird in a way that it's technically not block-level, it's + // essentially converted to a \n (or \r\n). There's also wbr but it doesn't + // guarantee linebreak, only suggests it. 
+ const linebreakElements = new Set(['br']) + + const visualLineElements = new Set([ + ...blockElements.values(), + ...linebreakElements.values() + ]) + + // All block-level elements that aren't empty elements, i.e. not <hr> or <br>
+ const nonEmptyElements = new Set(visualLineElements) + // Difference + for (let elem of emptyElements) { + nonEmptyElements.delete(elem) + } + + // All elements that we are recognizing + const allElements = new Set([ + ...nonEmptyElements.values(), + ...emptyElements.values() + ]) let buffer = [] // Current output buffer const level = [] // How deep we are in tags and which tags were there @@ -29,8 +62,8 @@ export const convertHtmlToLines = (html) => { let tagBuffer = null // Current tag buffer, if null = we are not currently reading a tag const flush = () => { // Processes current line buffer, adds it to output buffer and clears line buffer - if (textBuffer.trim().length > 0 && !level.some(l => ignoredTags.has(l))) { - buffer.push({ text: textBuffer }) + if (textBuffer.trim().length > 0) { + buffer.push({ level: [...level], text: textBuffer }) } else { buffer.push(textBuffer) } @@ -49,10 +82,12 @@ export const convertHtmlToLines = (html) => { } const handleClose = (tag) => { // handles closing tags - flush() - buffer.push(tag) if (level[0] === getTagName(tag)) { + flush() + buffer.push(tag) level.shift() + } else { // Broken case + textBuffer += tag } } @@ -67,10 +102,10 @@ export const convertHtmlToLines = (html) => { const tagFull = tagBuffer tagBuffer = null const tagName = getTagName(tagFull) - if (handledTags.has(tagName)) { - if (tagName === 'br') { + if (allElements.has(tagName)) { + if (linebreakElements.has(tagName)) { handleBr(tagFull) - } else if (openCloseTags.has(tagName)) { + } else if (nonEmptyElements.has(tagName)) { if (tagFull[1] === '/') { handleClose(tagFull) } else if (tagFull[tagFull.length - 2] === '/') { diff --git a/test/unit/specs/services/html_converter/html_line_converter.spec.js b/test/unit/specs/services/html_converter/html_line_converter.spec.js index 9485233f..c8c89700 100644 --- a/test/unit/specs/services/html_converter/html_line_converter.spec.js +++ b/test/unit/specs/services/html_converter/html_line_converter.spec.js @@ -1,8 
+1,17 @@ import { convertHtmlToLines } from 'src/services/html_converter/html_line_converter.service.js' -const mapOnlyText = (processor) => (input) => input.text ? processor(input.text) : input +const greentextHandle = new Set(['p', 'div']) +const mapOnlyText = (processor) => (input) => { + if (input.text && input.level.every(l => greentextHandle.has(l))) { + return processor(input.text) + } else if (input.text) { + return input.text + } else { + return input + } +} -describe('html_line_converter', () => { +describe.only('html_line_converter', () => { describe('with processor that keeps original line should not make any changes to HTML when', () => { const processorKeep = (line) => line it('fed with regular HTML with newlines', () => { @@ -81,7 +90,7 @@ describe('html_line_converter', () => { it('fed with very broken HTML with broken composition', () => { const input = '

lmao what whats going on
wha

' - const output = '

_
_
_

' + const output = '_

_

' const result = convertHtmlToLines(input) const comparableResult = result.map(mapOnlyText(processorReplace)).join('') expect(comparableResult).to.eql(output) @@ -111,7 +120,7 @@ describe('html_line_converter', () => { expect(comparableResult).to.eql(output) }) - it('fed with maybe valid HTML? self-closing divs and ps', () => { + it('fed with maybe valid HTML? (XHTML) self-closing divs and ps', () => { const input = 'a

what now

?' const output = '_

_

_' const result = convertHtmlToLines(input)