2021-06-12 16:47:23 +00:00
|
|
|
import { getTagName } from './utility.service.js'
|
|
|
|
|
2021-06-07 00:14:48 +00:00
|
|
|
/**
 * This is a not-so-tiny purpose-built HTML parser/processor. It parses HTML
 * and converts it into a tree structure representing tag openers/closers and
 * children.
 *
 * Structure follows this pattern: [opener, [...children], closer], except for
 * the root node, which is just [...children]. Text nodes can only be within
 * children and are represented as strings. Empty/self-closing elements are
 * represented as [tag]. An element that is never closed is left as
 * [opener, [...children]] without a closer. A closing tag that does not match
 * the innermost open element is kept as a plain string child.
 *
 * Intended use is to convert HTML structure and then recursively iterate over
 * it, most likely using a map. Very useful for dynamically rendering HTML,
 * replacing tags with JSX elements in a render function.
 *
 * known issue: doesn't handle CDATA so CDATA might not work well
 * known issue: doesn't handle HTML comments
 *
 * @param {string} [html=''] - HTML source to parse
 * @return {Array} root list of children (strings and nested tag arrays)
 */
export const convertHtmlToTree = (html = '') => {
  // Elements that are implicitly self-closing
  // https://developer.mozilla.org/en-US/docs/Glossary/empty_element
  const emptyElements = new Set([
    'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
    'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr'
  ])
  // TODO For future - also parse HTML5 multi-source components?

  const buffer = [] // Root output buffer (the returned tree)
  // Stack of currently open elements as [opener, children]. The first entry
  // is a sentinel for the root: it has an empty opener and shares `buffer`.
  const levels = [['', buffer]]
  let textBuffer = '' // Text accumulated since the last tag
  let tagBuffer = null // Tag being read; null = not currently inside a tag

  // Children list of the innermost open element (or the root buffer)
  const getCurrentBuffer = () => levels[levels.length - 1][1]

  // Emits accumulated text (if any) as a string child and resets the buffer
  const flushText = () => {
    if (textBuffer === '') return
    getCurrentBuffer().push(textBuffer)
    textBuffer = ''
  }

  // Self-closing tags have neither children nor closer: stored as [tag]
  const handleSelfClosing = (tag) => {
    getCurrentBuffer().push([tag])
  }

  // Opens a new nesting level and attaches it to its parent's children
  const handleOpen = (tag) => {
    const parentChildren = getCurrentBuffer()
    const newLevel = [tag, []]
    levels.push(newLevel)
    parentChildren.push(newLevel)
  }

  // Closes the innermost open element if the closer matches its opener,
  // otherwise keeps the stray closer as a plain string child.
  const handleClose = (tag) => {
    const currentLevel = levels[levels.length - 1]
    // The root sentinel has no real opener and must never be closed. Without
    // this guard, a closer whose name resolves to the same value as the empty
    // root opener (presumably both "no name" for input like `</>` - depends
    // on getTagName) would pop the root and break every later buffer access.
    const isInsideElement = levels.length > 1
    if (isInsideElement && getTagName(currentLevel[0]) === getTagName(tag)) {
      currentLevel.push(tag)
      levels.pop()
    } else {
      getCurrentBuffer().push(tag)
    }
  }

  for (const char of html) {
    if (tagBuffer === null) {
      if (char === '<') {
        // Start of a tag: whatever text preceded it is a finished text node
        flushText()
        tagBuffer = char
      } else {
        textBuffer += char
      }
      continue
    }

    tagBuffer += char
    if (char !== '>') continue

    // '>' terminates the tag currently being read
    const tagFull = tagBuffer
    tagBuffer = null
    const tagName = getTagName(tagFull)
    if (tagFull[1] === '/') {
      handleClose(tagFull)
    } else if (emptyElements.has(tagName) || tagFull[tagFull.length - 2] === '/') {
      // self-closing
      handleSelfClosing(tagFull)
    } else {
      handleOpen(tagFull)
    }
  }

  // Input ended in the middle of a tag: keep the partial tag as plain text
  if (tagBuffer !== null) {
    textBuffer += tagBuffer
  }

  flushText()
  return buffer
}
|