feat: 添加HTML解析方法

This commit is contained in:
pipipi-pikachu 2021-06-04 17:13:06 +08:00
parent 07c3b74ce3
commit a10704c0ba
7 changed files with 580 additions and 0 deletions

View File

@ -0,0 +1,47 @@
import { HTMLNode, CommentOrTextAST, ElementAST, AST } from './types'
/**
 * Splits `str` around the first occurrence of `sep`.
 *
 * @param str - Text to split.
 * @param sep - Separator to search for.
 * @returns `[str]` when `sep` does not occur; otherwise a two-element array
 * with the text before the separator and the text after it.
 */
export const splitHead = (str: string, sep: string) => {
  const at = str.indexOf(sep)
  if (at < 0) return [str]
  const head = str.slice(0, at)
  const tail = str.slice(at + sep.length)
  return [head, tail]
}
/**
 * Removes one pair of matching surrounding quotes from `str`.
 *
 * The string is returned unchanged unless it starts with `"` or `'` and its
 * last character is that same quote character.
 *
 * @param str - Possibly quoted text.
 * @returns The text without its outer quote pair, or `str` itself.
 */
const unquote = (str: string) => {
  const last = str.length - 1
  const first = str.charAt(0)
  if (first !== '"' && first !== "'") return str
  return str.charAt(last) === first ? str.slice(1, last) : str
}
/**
 * Converts raw attribute strings into key/value records.
 *
 * Each string is trimmed and split at its first `=`. The part after the `=`
 * is unquoted; when there is no `=`, the value is `null`.
 *
 * @param attributes - Raw attribute strings such as `class="a"` or `disabled`.
 * @returns One `{ key, value }` record per input string, in the same order.
 */
const formatAttributes = (attributes: string[]) => {
  return attributes.map(raw => {
    const [key, rest] = splitHead(raw.trim(), '=')
    return {
      key,
      value: typeof rest === 'string' ? unquote(rest) : null,
    }
  })
}
/**
 * Converts parser nodes into the public AST shape.
 *
 * Element nodes get a lower-cased tag name, structured attributes and
 * recursively formatted children; text and comment nodes are reduced to their
 * `type` and `content`.
 *
 * @param nodes - Nodes produced by the parser.
 * @returns The corresponding AST nodes, in the same order.
 */
export const format = (nodes: HTMLNode[]): AST[] => {
  const toAst = (node: HTMLNode): AST => {
    if (node.type !== 'element') {
      const leaf: CommentOrTextAST = { type: node.type, content: node.content }
      return leaf
    }
    const children = format(node.children)
    const element: ElementAST = {
      type: 'element',
      tagName: node.tagName.toLowerCase(),
      attributes: formatAttributes(node.attributes),
      children,
    }
    return element
  }
  return nodes.map(node => toAst(node))
}

View File

@ -0,0 +1,15 @@
// Based on https://github.com/andrejewski/himalaya; rewritten in TypeScript with some features simplified
import { lexer } from './lexer'
import { parser } from './parser'
import { format } from './format'
import { toHTML } from './stringify'
import type { AST } from './types'
/**
 * Parses an HTML string into an AST.
 *
 * The input is tokenized by `lexer`, the tokens are assembled into a node tree
 * by `parser`, and that tree is normalized by `format`.
 *
 * @param str - HTML source text.
 * @returns The formatted AST nodes.
 */
export const toAST = (str: string) => {
  return format(parser(lexer(str)))
}
export { toHTML, AST }

View File

@ -0,0 +1,276 @@
import startsWith from 'lodash/startsWith'
import endsWith from 'lodash/endsWith'
import { Token } from './types'
import { childlessTags } from './tags'
/** Mutable state shared by the lexing helpers in this file. */
interface State {
  /** The complete input string being tokenized. */
  str: string;
  /** Index into `str` of the next character to be read. */
  position: number;
  /** Tokens emitted so far, in the order they were pushed. */
  tokens: Token[];
}
/**
 * Moves the read position to the index `end`.
 *
 * @param state - Lexer state whose `position` is updated.
 * @param end - Target index.
 */
const jumpPosition = (state: State, end: number) => {
  movePositopn(state, end - state.position)
}
/**
 * Shifts the read position by `len` characters.
 *
 * @param state - Lexer state whose `position` is updated.
 * @param len - Offset added to the current position.
 */
const movePositopn = (state: State, len: number) => {
  state.position += len
}
/**
 * Finds where a run of text ends, i.e. the next `<` that begins markup.
 *
 * A `<` counts only when the character after it is `/`, `!`, or an ASCII
 * letter or digit; any other `<` is treated as part of the text.
 *
 * @param str - Source string.
 * @param index - Index at which to start searching.
 * @returns The index of the qualifying `<`, or `-1` if there is none.
 */
const findTextEnd = (str: string, index: number) => {
  let from = index
  for (;;) {
    const candidate = str.indexOf('<', from)
    if (candidate === -1) return -1
    const next = str.charAt(candidate + 1)
    const opensMarkup = next === '/' || next === '!' || /[A-Za-z0-9]/.test(next)
    if (opensMarkup) return candidate
    // A literal "<" inside text: keep searching after it.
    from = candidate + 1
  }
}
/**
 * Emits a `text` token for the characters between the current position and
 * the next markup start (or the end of the input when none is found).
 *
 * Nothing is emitted and the position does not move when markup starts at the
 * current position.
 *
 * @param state - Lexer state; its `position` and `tokens` are updated.
 */
const lexText = (state: State) => {
  const source = state.str
  const begin = state.position
  const found = findTextEnd(source, begin)
  if (found === begin) return
  const stop = found === -1 ? source.length : found
  const content = source.slice(begin, stop)
  jumpPosition(state, stop)
  state.tokens.push({ type: 'text', content })
}
/**
 * Emits a `comment` token for a comment that starts at the current position.
 *
 * The first four characters are skipped as the opener, and the content runs
 * up to the next `-->`. When no `-->` exists, the rest of the input becomes
 * the comment content.
 *
 * @param state - Lexer state; its `position` and `tokens` are updated.
 */
const lexComment = (state: State) => {
  const { str } = state
  movePositopn(state, 4)
  const bodyStart = state.position
  const closer = str.indexOf('-->', bodyStart)
  const unterminated = closer === -1
  const contentEnd = unterminated ? str.length : closer
  const commentEnd = unterminated ? str.length : closer + 3
  const content = str.slice(bodyStart, contentEnd)
  jumpPosition(state, commentEnd)
  state.tokens.push({ type: 'comment', content })
}
/**
 * Reads a tag name starting at the current position and emits a `tag` token.
 *
 * Leading whitespace, `/` and `>` characters are skipped; the name then runs
 * until the next whitespace, `/` or `>`.
 *
 * @param state - Lexer state; its `position` and `tokens` are updated.
 * @returns The tag name exactly as written in the source.
 */
const lexTagName = (state: State) => {
  const { str } = state
  const total = str.length
  const isNameChar = (ch: string) => !/\s/.test(ch) && ch !== '/' && ch !== '>'

  let nameStart = state.position
  while (nameStart < total && !isNameChar(str.charAt(nameStart))) nameStart++

  // The character at nameStart is already known to belong to the name.
  let nameEnd = nameStart + 1
  while (nameEnd < total && isNameChar(str.charAt(nameEnd))) nameEnd++

  jumpPosition(state, nameEnd)
  const tagName = str.slice(nameStart, nameEnd)
  state.tokens.push({ type: 'tag', content: tagName })
  return tagName
}
/**
 * Reads the attribute section of a tag and emits one `attribute` token per
 * attribute.
 *
 * The first pass splits the source into whitespace-separated words, treating
 * quoted sections as opaque, and stops at the first unquoted `/` or `>`.
 * The second pass joins words that belong to one attribute written with
 * spaces around the `=` (for example `a = "b"`, `a ="b"` or `a= "b"`).
 *
 * @param state - Lexer state; `position` is moved to where scanning stopped
 * and the attribute tokens are appended to `tokens`.
 */
const lexTagAttributes = (state: State) => {
  const { str, tokens } = state
  let cursor = state.position
  // Holds the active quote character while inside a quoted section.
  let quote = null
  let wordBegin = cursor
  const words = []
  const len = str.length
  while (cursor < len) {
    const char = str.charAt(cursor)
    if (quote) {
      // Inside quotes everything is consumed until the same quote reappears.
      const isQuoteEnd = char === quote
      if (isQuoteEnd) quote = null
      cursor++
      continue
    }
    const isTagEnd = char === '/' || char === '>'
    if (isTagEnd) {
      // Flush the pending word; the cursor is left on the terminating character.
      if (cursor !== wordBegin) words.push(str.slice(wordBegin, cursor))
      break
    }
    const isWordEnd = /\s/.test(char)
    if (isWordEnd) {
      if (cursor !== wordBegin) words.push(str.slice(wordBegin, cursor))
      wordBegin = cursor + 1
      cursor++
      continue
    }
    const isQuoteStart = char === '\'' || char === '"'
    if (isQuoteStart) {
      quote = char
      cursor++
      continue
    }
    cursor++
  }
  // Note: a word still open when the input runs out is never pushed to `words`.
  jumpPosition(state, cursor)
  const type = 'attribute'
  for (let i = 0; i < words.length; i++) {
    const word = words[i]
    const isNotPair = word.indexOf('=') === -1
    if (isNotPair) {
      const secondWord = words[i + 1]
      if (secondWord && startsWith(secondWord, '=')) {
        if (secondWord.length > 1) {
          // `key` followed by `=value`: join both words.
          const newWord = word + secondWord
          tokens.push({ type, content: newWord })
          i += 1
          continue
        }
        // `key` followed by a lone `=`: the `=` word is skipped either way.
        const thirdWord = words[i + 2]
        i += 1
        if (thirdWord) {
          // `key`, `=`, `value`: join all three words.
          const newWord = word + '=' + thirdWord
          tokens.push({ type, content: newWord })
          i += 1
          continue
        }
        // No value follows: fall through and emit `word` on its own below.
      }
    }
    if (endsWith(word, '=')) {
      const secondWord = words[i + 1]
      if (secondWord && secondWord.indexOf('=') === -1) {
        // `key=` followed by `value`: join both words.
        const newWord = word + secondWord
        tokens.push({ type, content: newWord })
        i += 1
        continue
      }
      // Dangling `=` with no usable value: emit the word without it.
      const newWord = word.slice(0, -1)
      tokens.push({ type, content: newWord })
      continue
    }
    tokens.push({ type, content: word })
  }
}
/**
 * Lexes the body of an element whose content must not be tokenized as markup.
 *
 * Each `</` after the current position is lexed into a scratch state. When
 * the tag found there has the same name as `tagName` (compared
 * case-insensitively), everything before it is emitted as one `text` token,
 * followed by the closing tag's tokens. When the input has no further `</`,
 * the remainder is handed to `lexText`.
 *
 * @param tagName - Name of the element that was just opened.
 * @param state - Lexer state; its `position` and `tokens` are updated.
 */
const lexSkipTag = (tagName: string, state: State) => {
  const { str, tokens } = state
  const expected = tagName.toLowerCase()
  const total = str.length
  let searchFrom = state.position
  while (searchFrom < total) {
    const closeAt = str.indexOf('</', searchFrom)
    if (closeAt === -1) {
      lexText(state)
      return
    }
    // Lex the candidate closing tag into a scratch state so that nothing is
    // committed unless its name matches.
    const probe = {
      str,
      position: state.position,
      tokens: [],
    }
    jumpPosition(probe, closeAt)
    const closingName = lexTag(probe)
    if (closingName.toLowerCase() === expected) {
      if (closeAt !== state.position) {
        const rawStart = state.position
        jumpPosition(state, closeAt)
        tokens.push({
          type: 'text',
          content: str.slice(rawStart, closeAt),
        })
      }
      tokens.push(...probe.tokens)
      jumpPosition(state, probe.position)
      return
    }
    // Some other closing tag: resume the search after it.
    searchFrom = probe.position
  }
}
/**
 * Lexes one complete tag starting at the `<` under the current position.
 *
 * Emits, in order: a `tag-start` token (`close` is true for `</`), the `tag`
 * name token, any `attribute` tokens, and a `tag-end` token (`close` is true
 * when the tag ends with `/`).
 *
 * @param state - Lexer state; its `position` and `tokens` are updated.
 * @returns The tag name exactly as written in the source.
 */
const lexTag = (state: State) => {
  const { str } = state

  const isClosingTag = str.charAt(state.position + 1) === '/'
  // Skip "<" or "</".
  movePositopn(state, isClosingTag ? 2 : 1)
  state.tokens.push({ type: 'tag-start', close: isClosingTag })

  const tagName = lexTagName(state)
  lexTagAttributes(state)

  const isSelfClosed = str.charAt(state.position) === '/'
  // Skip ">" or "/>".
  movePositopn(state, isSelfClosed ? 2 : 1)
  state.tokens.push({ type: 'tag-end', close: isSelfClosed })

  return tagName
}
/**
 * Main lexing loop: tokenizes `state.str` from the current position to its
 * end.
 *
 * Each round first tries to read text. If the position did not move, markup
 * starts here: a comment when `!--` follows the `<`, otherwise a tag. After a
 * tag listed in `childlessTags`, its body is lexed by `lexSkipTag`.
 *
 * @param state - Lexer state; its `position` and `tokens` are updated.
 */
const lex = (state: State) => {
  const { str } = state
  const total = str.length
  while (state.position < total) {
    const before = state.position
    lexText(state)
    if (state.position !== before) continue

    if (startsWith(str, '!--', before + 1)) {
      lexComment(state)
      continue
    }

    const tagName = lexTag(state)
    if (childlessTags.includes(tagName.toLowerCase())) lexSkipTag(tagName, state)
  }
}
/**
 * Tokenizes an HTML string.
 *
 * @param str - HTML source text.
 * @returns The tokens in source order.
 */
export const lexer = (str: string): Token[] => {
  const tokens: Token[] = []
  lex({ str, position: 0, tokens })
  return tokens
}

View File

@ -0,0 +1,129 @@
import { Token, HTMLNode, TagToken, NormalElement, TagEndToken, AttributeToken, TextToken } from './types'
import { closingTags, closingTagAncestorBreakers, voidTags } from './tags'
/** One entry of the parser's stack of currently open elements. */
interface StackItem {
  /** Lower-cased tag name of the open element; `null` for the root entry. */
  tagName: string | null;
  /** Child list that receives the nodes parsed inside this element. */
  children: HTMLNode[];
}
/** Parser state passed to `parse`. */
interface State {
  /** Open elements, root first; the last entry receives newly parsed nodes. */
  stack: StackItem[];
  /** Index into `tokens` of the next token to consume. */
  cursor: number;
  /** The full token list produced by the lexer. */
  tokens: Token[];
}
/**
 * Builds a node tree from lexer tokens.
 *
 * @param tokens - Tokens produced by the lexer.
 * @returns The top-level nodes, i.e. the children collected on the root entry.
 */
export const parser = (tokens: Token[]) => {
  const root: StackItem = { tagName: null, children: [] }
  parse({ tokens, cursor: 0, stack: [root] })
  return root.children
}
/**
 * Reports whether an open ancestor prevents `tagName` from auto-closing an
 * earlier open element of the same name.
 *
 * The stack is walked from the innermost entry outwards. The walk stops at the
 * first entry whose tag name equals `tagName`; if one of the breaker tags
 * listed for `tagName` in `closingTagAncestorBreakers` is met before that, the
 * result is `true`.
 *
 * @param tagName - Lower-cased name of the tag being opened.
 * @param stack - Currently open elements, root first.
 * @returns `true` when a breaker ancestor is found first, otherwise `false`.
 */
export const hasTerminalParent = (tagName: string, stack: StackItem[]) => {
  const breakers: Record<string, string[]> = closingTagAncestorBreakers
  // Only own keys count: names such as "constructor" or "toString" would
  // otherwise resolve to inherited Object members that have no `includes`.
  if (!Object.prototype.hasOwnProperty.call(breakers, tagName)) return false
  const tagParents = breakers[tagName]
  for (let currentIndex = stack.length - 1; currentIndex >= 0; currentIndex--) {
    const parentTagName = stack[currentIndex].tagName
    if (parentTagName === tagName) break
    // The root entry has a null tag name and can never be a breaker.
    if (parentTagName !== null && tagParents.includes(parentTagName)) return true
  }
  return false
}
/**
 * Truncates the open-element stack in place.
 *
 * @param stack - Stack to shorten.
 * @param newLength - Index from which entries are removed; for an index within
 * the stack this is also the resulting length.
 */
export const rewindStack = (stack: StackItem[], newLength: number) => {
  stack.splice(newLength)
}
/**
 * Consumes tokens starting at `state.cursor` and appends the resulting nodes
 * to the children of the innermost open element on `state.stack`.
 *
 * The function calls itself for the content of every element that can have
 * children. It returns when the tokens run out, or when a closing tag matches
 * an entry on the stack. On return `state.cursor` holds the index of the
 * next unconsumed token.
 *
 * @param state - Parser state; `stack` is mutated and `cursor` is written back.
 */
export const parse = (state: State) => {
  const { stack, tokens } = state
  let { cursor } = state
  // Nodes are appended to the children of the innermost open element.
  let nodes = stack[stack.length - 1].children
  const len = tokens.length
  while (cursor < len) {
    const token = tokens[cursor]
    if (token.type !== 'tag-start') {
      // Any token that does not start a tag is appended unchanged as a node.
      nodes.push(token as TextToken)
      cursor++
      continue
    }
    // The token after a tag-start is taken to be the tag name token.
    const tagToken = tokens[++cursor] as TagToken
    cursor++
    const tagName = tagToken.content.toLowerCase()
    if (token.close) {
      // Closing tag: look for a matching open element, innermost first.
      let index = stack.length
      let shouldRewind = false
      while (--index > -1) {
        if (stack[index].tagName === tagName) {
          shouldRewind = true
          break
        }
      }
      // Skip the tag-end token(s) that follow the closing tag's name.
      while (cursor < len) {
        if (tokens[cursor].type !== 'tag-end') break
        cursor++
      }
      if (shouldRewind) {
        // Drop the matching entry and everything above it, then return to the caller.
        rewindStack(stack, index)
        break
      }
      // No open element matches: the closing tag is ignored.
      else continue
    }
    const isClosingTag = closingTags.includes(tagName)
    let shouldRewindToAutoClose = isClosingTag
    if (shouldRewindToAutoClose) {
      shouldRewindToAutoClose = !hasTerminalParent(tagName, stack)
    }
    if (shouldRewindToAutoClose) {
      // Auto-close: find an open element with the same name (the root at
      // index 0 is not considered), drop it and everything above it, and
      // append the new element to the children of the entry below it.
      let currentIndex = stack.length - 1
      while (currentIndex > 0) {
        if (tagName === stack[currentIndex].tagName) {
          rewindStack(stack, currentIndex)
          const previousIndex = currentIndex - 1
          nodes = stack[previousIndex].children
          break
        }
        currentIndex = currentIndex - 1
      }
    }
    // Collect the content of every token up to the tag-end as a raw attribute string.
    const attributes = []
    let tagEndToken: TagEndToken | undefined
    while (cursor < len) {
      const _token = tokens[cursor]
      if (_token.type === 'tag-end') {
        tagEndToken = _token
        break
      }
      attributes.push((_token as AttributeToken).content)
      cursor++
    }
    // Tokens ran out before a tag-end: stop without emitting this element.
    if (!tagEndToken) break
    cursor++
    const children: HTMLNode[] = []
    // The node keeps the tag name as written; only comparisons use the lower-cased name.
    const elementNode: NormalElement = {
      type: 'element',
      tagName: tagToken.content,
      attributes,
      children,
    }
    nodes.push(elementNode)
    // Self-closed tags and void tags get no children.
    const hasChildren = !(tagEndToken.close || voidTags.includes(tagName))
    if (hasChildren) {
      stack.push({tagName, children})
      const innerState = { tokens, cursor, stack }
      parse(innerState)
      // Resume after whatever the nested call consumed.
      cursor = innerState.cursor
    }
  }
  state.cursor = cursor
}

View File

@ -0,0 +1,28 @@
import { AST, ElementAST, ElementAttribute } from './types'
import { voidTags } from './tags'
/**
 * Serializes attributes into the text that follows a tag name.
 *
 * Every emitted attribute is preceded by a single space. Attributes whose
 * value is `null` are written as a bare key. An empty `style` attribute is
 * omitted. Values are wrapped in single quotes unless they contain a single
 * quote, in which case double quotes are used and any double quote inside the
 * value is written as `&quot;`.
 *
 * @param attributes - Attributes to serialize, in output order.
 * @returns The serialized attribute text, or `''` when nothing is emitted.
 */
export const formatAttributes = (attributes: ElementAttribute[]) => {
  return attributes.reduce((attrs, attribute) => {
    const { key, value } = attribute
    // A null value marks an attribute that was written without "=value".
    if (value === null) return `${attrs} ${key}`
    // Skip an empty style attribute but keep what has been serialized so far.
    if (key === 'style' && !value) return attrs
    const hasSingleQuote = value.indexOf('\'') !== -1
    if (!hasSingleQuote) return `${attrs} ${key}='${value}'`
    // Double-quoted form: escape embedded double quotes so the value cannot
    // terminate the attribute early.
    const escaped = value.replace(/"/g, '&quot;')
    return `${attrs} ${key}="${escaped}"`
  }, '')
}
/**
 * Serializes an AST back into an HTML string.
 *
 * Text nodes are written as their content, comments are wrapped in
 * `<!--` and `-->`, and elements are written with their attributes and
 * recursively serialized children. Elements whose lower-cased tag name is in
 * `voidTags` get neither children nor a closing tag.
 *
 * @param tree - AST nodes to serialize.
 * @returns The concatenated HTML of all nodes.
 */
export const toHTML = (tree: AST[]) => {
  const render = (node: AST): string => {
    switch (node.type) {
      case 'text':
        return node.content
      case 'comment':
        return `<!--${node.content}-->`
    }
    const { tagName, attributes, children } = node as ElementAST
    const openTag = `<${tagName}${formatAttributes(attributes)}>`
    if (voidTags.includes(tagName.toLowerCase())) return openTag
    return `${openTag}${toHTML(children)}</${tagName}>`
  }
  return tree.map(node => render(node)).join('')
}

View File

@ -0,0 +1,16 @@
/** Tags whose body the lexer reads as raw text up to the matching closing tag. */
export const childlessTags = [
  'style',
  'script',
  'template',
]

/** Tags that the parser lets auto-close an already open element of the same name. */
export const closingTags = [
  'html',
  'head',
  'body',
  'p',
  'dt',
  'dd',
  'li',
  'option',
  'thead',
  'th',
  'tbody',
  'tr',
  'td',
  'tfoot',
  'colgroup',
]

/**
 * For each tag, the ancestor tags that stop the auto-close search: when one of
 * them is open above the nearest same-named element, no auto-close happens.
 */
export const closingTagAncestorBreakers = {
  li: [
    'ul',
    'ol',
    'menu',
  ],
  dt: [
    'dl',
  ],
  dd: [
    'dl',
  ],
  tbody: [
    'table',
  ],
  thead: [
    'table',
  ],
  tfoot: [
    'table',
  ],
  tr: [
    'table',
  ],
  td: [
    'table',
  ],
}

/** Tags that never have children and are serialized without a closing tag. */
export const voidTags = [
  '!doctype',
  'area',
  'base',
  'br',
  'col',
  'command',
  'embed',
  'hr',
  'img',
  'input',
  'keygen',
  'link',
  'meta',
  'param',
  'source',
  'track',
  'wbr',
]

View File

@ -0,0 +1,69 @@
/** A single attribute of a formatted element. */
export interface ElementAttribute {
  /** Attribute name (the text before the first `=`). */
  key: string;
  /** Attribute value; `null` when the attribute has no `=value` part. */
  value: string | null;
}

/** Comment node as produced by the parser. */
export interface CommentElement {
  type: 'comment';
  /** Comment text. */
  content: string;
}

/** Text node as produced by the parser. */
export interface TextElement {
  type: 'text';
  /** Text content. */
  content: string;
}

/** Element node as produced by the parser, before formatting. */
export interface NormalElement {
  type: 'element';
  /** Tag name. */
  tagName: string;
  /** Nodes nested inside this element. */
  children: HTMLNode[];
  /** Raw attribute strings, not yet split into key and value. */
  attributes: string[];
}

/** Any node produced by the parser. */
export type HTMLNode = CommentElement | TextElement | NormalElement

/** Element node of the formatted AST. */
export interface ElementAST {
  type: 'element';
  /** Tag name. */
  tagName: string;
  /** Formatted child nodes. */
  children: AST[];
  /** Attributes as key/value records. */
  attributes: ElementAttribute[];
}

/** Comment or text node of the formatted AST. */
export interface CommentOrTextAST {
  type: 'comment' | 'text';
  /** Comment or text content. */
  content: string;
}

/** Any node of the formatted AST. */
export type AST = CommentOrTextAST | ElementAST

/** Token marking the beginning of a tag. */
export interface TagStartToken {
  type: 'tag-start';
  /** True when the tag is a closing tag. */
  close: boolean;
}

/** Token marking the end of a tag. */
export interface TagEndToken {
  type: 'tag-end';
  /** True when the tag is self-closing. */
  close: boolean;
}

/** Token carrying a tag name. */
export interface TagToken {
  type: 'tag';
  /** The tag name. */
  content: string;
}

/** Token carrying a run of text. */
export interface TextToken {
  type: 'text';
  /** The text. */
  content: string;
}

/** Token carrying a comment. */
export interface CommentToken {
  type: 'comment';
  /** The comment text. */
  content: string;
}

/** Token carrying one raw attribute. */
export interface AttributeToken {
  type: 'attribute';
  /** Raw attribute text. */
  content: string;
}

/** Any token emitted by the lexer. */
export type Token = TagStartToken | TagEndToken | TagToken | TextToken | CommentToken | AttributeToken