2021-06-12 16:47:23 +00:00
|
|
|
import { getTagName } from './utility.service.js'
|
|
|
|
|
2019-11-13 22:18:14 +00:00
|
|
|
/**
 * This is a tiny purpose-built HTML parser/processor. It detects any type of
 * visual newline and converts the entire HTML into an array structure.
 *
 * Non-blank text lines are represented as objects with two properties:
 * `text`, containing the visual line, and `level`, a snapshot of the
 * recognized block-level tags that are open at that point (innermost first,
 * as named by getTagName). Everything else - tags that delimit visual lines,
 * raw "\n" characters and empty/whitespace-only lines - is kept as a plain
 * string. Intended usage is to process the array with .map() in which the
 * map function returns a string, so the resulting array can be converted
 * back to html with a .join('').
 *
 * Generally this isn't very useful except for when you really need to either
 * modify visual lines (greentext i.e. simple quoting) or do something with
 * first/last line.
 *
 * Tags that do not create a visual line (inline tags such as <span>, and
 * inline empty elements such as <img>) are left untouched inside the text of
 * the line they belong to.
 *
 * known issue: doesn't handle CDATA so nested CDATA might not work well
 *
 * @param {string} html - HTML source to process
 * @return {(string|{ level: string[], text: string })[]} processed html in form of a list.
 */
export const convertHtmlToLines = (html) => {
  // Elements that are implicitly self-closing
  // https://developer.mozilla.org/en-US/docs/Glossary/empty_element
  const emptyElements = new Set([
    'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
    'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr'
  ])
  // Block-level element (they make a visual line)
  // https://developer.mozilla.org/en-US/docs/Web/HTML/Block-level_elements
  const blockElements = new Set([
    'address', 'article', 'aside', 'blockquote', 'details', 'dialog', 'dd',
    'div', 'dl', 'dt', 'fieldset', 'figcaption', 'figure', 'footer', 'form',
    'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'hgroup', 'hr', 'li', 'main',
    'nav', 'ol', 'p', 'pre', 'section', 'table', 'ul'
  ])
  // br is very weird in a way that it's technically not block-level, it's
  // essentially converted to a \n (or \r\n). There's also wbr but it doesn't
  // guarantee linebreak, only suggest it.
  const linebreakElements = new Set(['br'])

  // Every element that starts/ends a visual line
  const visualLineElements = new Set([
    ...blockElements,
    ...linebreakElements
  ])

  const buffer = [] // Current output buffer
  const level = [] // How deep we are in tags and which tags were there
  let textBuffer = '' // Current line content
  let tagBuffer = null // Current tag buffer, if null = we are not currently reading a tag

  // Processes current line buffer, adds it to output buffer and clears line buffer
  const flush = () => {
    if (textBuffer.trim().length > 0) {
      // Copy `level` so later pushes/pops don't alter lines already emitted
      buffer.push({ level: [...level], text: textBuffer })
    } else {
      // Blank lines (including '') stay plain strings so .join('') keeps them verbatim
      buffer.push(textBuffer)
    }
    textBuffer = ''
  }

  // Handles single newlines/linebreaks/selfclosing: ends the line, emits the tag as-is
  const handleBr = (tag) => {
    flush()
    buffer.push(tag)
  }

  // Handles opening tags: ends the line, emits the tag and goes one level deeper
  const handleOpen = (tag) => {
    flush()
    buffer.push(tag)
    level.unshift(getTagName(tag))
  }

  // Handles closing tags
  const handleClose = (tag) => {
    if (level[0] === getTagName(tag)) {
      flush()
      buffer.push(tag)
      level.shift()
    } else { // Broken case: doesn't match the innermost open tag, keep it as text
      textBuffer += tag
    }
  }

  // Routes a complete "<...>" tag to the matching handler
  const handleTag = (tagFull) => {
    const tagName = getTagName(tagFull)
    if (linebreakElements.has(tagName)) {
      handleBr(tagFull)
    } else if (!visualLineElements.has(tagName)) {
      // Not a line delimiter: unknown/inline tags, including inline empty
      // elements such as <img>, stay inside the current line instead of
      // being dropped from the output.
      textBuffer += tagFull
    } else if (emptyElements.has(tagName)) {
      // Block-level element without content (<hr>): delimits a line but
      // never opens a level since it has no closing tag.
      handleBr(tagFull)
    } else if (tagFull[1] === '/') {
      handleClose(tagFull)
    } else if (tagFull[tagFull.length - 2] === '/') {
      // self-closing
      handleBr(tagFull)
    } else {
      handleOpen(tagFull)
    }
  }

  for (let i = 0; i < html.length; i++) {
    const char = html[i]
    if (tagBuffer !== null) {
      // Inside a tag: accumulate until the closing '>'
      tagBuffer += char
      if (char === '>') {
        const tagFull = tagBuffer
        tagBuffer = null
        handleTag(tagFull)
      }
    } else if (char === '<') {
      tagBuffer = char
    } else if (char === '\n') {
      handleBr(char)
    } else {
      textBuffer += char
    }
  }

  // Unterminated tag at the end of input is treated as plain text
  if (tagBuffer) {
    textBuffer += tagBuffer
  }

  flush()

  return buffer
}
|