mirror of
https://github.com/pipipi-pikachu/PPTist.git
synced 2025-04-15 02:20:00 +08:00
feat: 添加HTML解析方法
This commit is contained in:
parent
07c3b74ce3
commit
a10704c0ba
47
src/utils/htmlParser/format.ts
Normal file
47
src/utils/htmlParser/format.ts
Normal file
@ -0,0 +1,47 @@
|
||||
import { HTMLNode, CommentOrTextAST, ElementAST, AST } from './types'
|
||||
|
||||
export const splitHead = (str: string, sep: string) => {
|
||||
const idx = str.indexOf(sep)
|
||||
if (idx === -1) return [str]
|
||||
return [str.slice(0, idx), str.slice(idx + sep.length)]
|
||||
}
|
||||
|
||||
const unquote = (str: string) => {
|
||||
const car = str.charAt(0)
|
||||
const end = str.length - 1
|
||||
const isQuoteStart = car === '"' || car === "'"
|
||||
if (isQuoteStart && car === str.charAt(end)) {
|
||||
return str.slice(1, end)
|
||||
}
|
||||
return str
|
||||
}
|
||||
|
||||
const formatAttributes = (attributes: string[]) => {
|
||||
return attributes.map(attribute => {
|
||||
const parts = splitHead(attribute.trim(), '=')
|
||||
const key = parts[0]
|
||||
const value = typeof parts[1] === 'string' ? unquote(parts[1]) : null
|
||||
return { key, value }
|
||||
})
|
||||
}
|
||||
|
||||
export const format = (nodes: HTMLNode[]): AST[] => {
|
||||
return nodes.map(node => {
|
||||
if (node.type === 'element') {
|
||||
const children = format(node.children)
|
||||
const item: ElementAST = {
|
||||
type: 'element',
|
||||
tagName: node.tagName.toLowerCase(),
|
||||
attributes: formatAttributes(node.attributes),
|
||||
children,
|
||||
}
|
||||
return item
|
||||
}
|
||||
|
||||
const item: CommentOrTextAST = {
|
||||
type: node.type,
|
||||
content: node.content,
|
||||
}
|
||||
return item
|
||||
})
|
||||
}
|
15
src/utils/htmlParser/index.ts
Normal file
15
src/utils/htmlParser/index.ts
Normal file
@ -0,0 +1,15 @@
|
||||
// 参考:https://github.com/andrejewski/himalaya 用TypeScript重写并简化部分功能
|
||||
|
||||
import { lexer } from './lexer'
|
||||
import { parser } from './parser'
|
||||
import { format } from './format'
|
||||
import { toHTML } from './stringify'
|
||||
import type { AST } from './types'
|
||||
|
||||
export const toAST = (str: string) => {
|
||||
const tokens = lexer(str)
|
||||
const nodes = parser(tokens)
|
||||
return format(nodes)
|
||||
}
|
||||
|
||||
export { toHTML }
// AST is a type-only import; re-export it with `export type` so the module
// compiles under isolatedModules / single-file transpilers (Babel, esbuild).
export type { AST }
|
276
src/utils/htmlParser/lexer.ts
Normal file
276
src/utils/htmlParser/lexer.ts
Normal file
@ -0,0 +1,276 @@
|
||||
import startsWith from 'lodash/startsWith'
|
||||
import endsWith from 'lodash/endsWith'
|
||||
import { Token } from './types'
|
||||
import { childlessTags } from './tags'
|
||||
|
||||
// Mutable lexer state threaded through every lex* helper.
interface State {
  str: string; // full input HTML string
  position: number; // current scan offset into str
  tokens: Token[]; // output token stream, appended in document order
}
|
||||
|
||||
const jumpPosition = (state: State, end: number) => {
|
||||
const len = end - state.position
|
||||
movePositopn(state, len)
|
||||
}
|
||||
|
||||
const movePositopn = (state: State, len: number) => {
|
||||
state.position = state.position + len
|
||||
}
|
||||
|
||||
const findTextEnd = (str: string, index: number) => {
|
||||
const isEnd = false
|
||||
while (!isEnd) {
|
||||
const textEnd = str.indexOf('<', index)
|
||||
if (textEnd === -1) {
|
||||
return textEnd
|
||||
}
|
||||
const char = str.charAt(textEnd + 1)
|
||||
if (char === '/' || char === '!' || /[A-Za-z0-9]/.test(char)) {
|
||||
return textEnd
|
||||
}
|
||||
index = textEnd + 1
|
||||
}
|
||||
return -1
|
||||
}
|
||||
|
||||
const lexText = (state: State) => {
|
||||
const { str } = state
|
||||
let textEnd = findTextEnd(str, state.position)
|
||||
if (textEnd === state.position) return
|
||||
if (textEnd === -1) {
|
||||
textEnd = str.length
|
||||
}
|
||||
|
||||
const content = str.slice(state.position, textEnd)
|
||||
jumpPosition(state, textEnd)
|
||||
|
||||
state.tokens.push({
|
||||
type: 'text',
|
||||
content,
|
||||
})
|
||||
}
|
||||
|
||||
const lexComment = (state: State) => {
|
||||
const { str } = state
|
||||
|
||||
movePositopn(state, 4)
|
||||
let contentEnd = str.indexOf('-->', state.position)
|
||||
let commentEnd = contentEnd + 3
|
||||
if (contentEnd === -1) {
|
||||
contentEnd = commentEnd = str.length
|
||||
}
|
||||
|
||||
const content = str.slice(state.position, contentEnd)
|
||||
jumpPosition(state, commentEnd)
|
||||
|
||||
state.tokens.push({
|
||||
type: 'comment',
|
||||
content,
|
||||
})
|
||||
}
|
||||
|
||||
const lexTagName = (state: State) => {
|
||||
const { str } = state
|
||||
const len = str.length
|
||||
let start = state.position
|
||||
|
||||
while (start < len) {
|
||||
const char = str.charAt(start)
|
||||
const isTagChar = !(/\s/.test(char) || char === '/' || char === '>')
|
||||
if (isTagChar) break
|
||||
start++
|
||||
}
|
||||
|
||||
let end = start + 1
|
||||
while (end < len) {
|
||||
const char = str.charAt(end)
|
||||
const isTagChar = !(/\s/.test(char) || char === '/' || char === '>')
|
||||
if (!isTagChar) break
|
||||
end++
|
||||
}
|
||||
|
||||
jumpPosition(state, end)
|
||||
const tagName = str.slice(start, end)
|
||||
state.tokens.push({
|
||||
type: 'tag',
|
||||
content: tagName
|
||||
})
|
||||
return tagName
|
||||
}
|
||||
|
||||
// Scans the attribute section of a tag (everything between the tag name
// and '/' or '>'), pushing one 'attribute' token per key or key=value pair.
// Pass 1 splits the section into whitespace-separated words while keeping
// quoted values intact; pass 2 re-joins words that were split around '='
// (e.g. `key = "value"` lexed as three words).
const lexTagAttributes = (state: State) => {
  const { str, tokens } = state
  let cursor = state.position
  let quote = null // the active quote character while inside a quoted value
  let wordBegin = cursor // start offset of the word currently being scanned
  const words = []
  const len = str.length
  while (cursor < len) {
    const char = str.charAt(cursor)
    if (quote) {
      // Inside a quoted value: only the matching quote ends it.
      const isQuoteEnd = char === quote
      if (isQuoteEnd) quote = null
      cursor++
      continue
    }

    // '/' or '>' terminates the attribute section.
    const isTagEnd = char === '/' || char === '>'
    if (isTagEnd) {
      if (cursor !== wordBegin) words.push(str.slice(wordBegin, cursor))
      break
    }

    // Unquoted whitespace ends the current word.
    const isWordEnd = /\s/.test(char)
    if (isWordEnd) {
      if (cursor !== wordBegin) words.push(str.slice(wordBegin, cursor))
      wordBegin = cursor + 1
      cursor++
      continue
    }

    const isQuoteStart = char === '\'' || char === '"'
    if (isQuoteStart) {
      quote = char
      cursor++
      continue
    }

    cursor++
  }
  jumpPosition(state, cursor)

  // Pass 2: merge words that belong to one attribute.
  const type = 'attribute'
  for (let i = 0; i < words.length; i++) {
    const word = words[i]

    const isNotPair = word.indexOf('=') === -1
    if (isNotPair) {
      const secondWord = words[i + 1]
      if (secondWord && startsWith(secondWord, '=')) {
        if (secondWord.length > 1) {
          // `key` `=value`  ->  `key=value`
          const newWord = word + secondWord
          tokens.push({ type, content: newWord })
          i += 1
          continue
        }
        // `key` `=` `value`  ->  `key=value`
        const thirdWord = words[i + 2]
        i += 1
        if (thirdWord) {
          const newWord = word + '=' + thirdWord
          tokens.push({ type, content: newWord })
          i += 1
          continue
        }
      }
    }
    if (endsWith(word, '=')) {
      // `key=` `value`  ->  `key=value`
      const secondWord = words[i + 1]
      if (secondWord && secondWord.indexOf('=') === -1) {
        const newWord = word + secondWord
        tokens.push({ type, content: newWord })
        i += 1
        continue
      }

      // `key=` with nothing usable after it: emit as a bare key.
      const newWord = word.slice(0, -1)
      tokens.push({ type, content: newWord })
      continue
    }

    // Plain `key` or an already-complete `key=value`.
    tokens.push({ type, content: word })
  }
}
|
||||
|
||||
// Consumes the raw content of a childless tag (style/script/template):
// everything up to the matching closing tag is emitted as a single text
// token, followed by the closing tag's own tokens. Used so '<' inside
// scripts/styles is not mistaken for markup.
const lexSkipTag = (tagName: string, state: State) => {
  const { str, tokens } = state
  const safeTagName = tagName.toLowerCase()
  const len = str.length
  let index = state.position

  while (index < len) {
    const nextTag = str.indexOf('</', index)
    if (nextTag === -1) {
      // No closing tag at all: the rest of the input becomes text.
      lexText(state)
      break
    }

    // Lex the candidate closing tag into a scratch state so the main
    // state stays untouched if it turns out to close a different tag.
    const tagState = {
      str,
      position: state.position,
      tokens: [],
    }
    jumpPosition(tagState, nextTag)
    const name = lexTag(tagState)
    if (safeTagName !== name.toLowerCase()) {
      // This '</' belonged to some other tag; keep scanning past it.
      index = tagState.position
      continue
    }

    // Emit the raw content before the closing tag as one text token.
    if (nextTag !== state.position) {
      const textStart = state.position
      jumpPosition(state, nextTag)
      tokens.push({
        type: 'text',
        content: str.slice(textStart, nextTag),
      })
    }

    // Adopt the scratch state's tokens (the closing tag) and position.
    tokens.push(...tagState.tokens)
    jumpPosition(state, tagState.position)
    break
  }
}
|
||||
|
||||
const lexTag = (state: State) => {
|
||||
const { str } = state
|
||||
const secondChar = str.charAt(state.position + 1)
|
||||
const tagStartClose = secondChar === '/'
|
||||
movePositopn(state, tagStartClose ? 2 : 1)
|
||||
state.tokens.push({
|
||||
type: 'tag-start',
|
||||
close: tagStartClose,
|
||||
})
|
||||
|
||||
const tagName = lexTagName(state)
|
||||
lexTagAttributes(state)
|
||||
|
||||
const firstChar = str.charAt(state.position)
|
||||
const tagEndClose = firstChar === '/'
|
||||
movePositopn(state, tagEndClose ? 2 : 1)
|
||||
state.tokens.push({
|
||||
type: 'tag-end',
|
||||
close: tagEndClose,
|
||||
})
|
||||
return tagName
|
||||
}
|
||||
|
||||
const lex = (state: State) => {
|
||||
const str = state.str
|
||||
const len = str.length
|
||||
|
||||
while (state.position < len) {
|
||||
const start = state.position
|
||||
lexText(state)
|
||||
|
||||
if (state.position === start) {
|
||||
const isComment = startsWith(str, '!--', start + 1)
|
||||
if (isComment) lexComment(state)
|
||||
else {
|
||||
const tagName = lexTag(state)
|
||||
const safeTag = tagName.toLowerCase()
|
||||
if (childlessTags.includes(safeTag)) lexSkipTag(tagName, state)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
export const lexer = (str: string): Token[] => {
|
||||
const state = {
|
||||
str,
|
||||
position: 0,
|
||||
tokens: [],
|
||||
}
|
||||
lex(state)
|
||||
return state.tokens
|
||||
}
|
129
src/utils/htmlParser/parser.ts
Normal file
129
src/utils/htmlParser/parser.ts
Normal file
@ -0,0 +1,129 @@
|
||||
import { Token, HTMLNode, TagToken, NormalElement, TagEndToken, AttributeToken, TextToken } from './types'
|
||||
import { closingTags, closingTagAncestorBreakers, voidTags } from './tags'
|
||||
|
||||
// One open element on the parse stack; the synthetic root uses tagName null.
interface StackItem {
  tagName: string | null; // lower-cased tag name, or null for the root
  children: HTMLNode[]; // nodes collected for this element so far
}
|
||||
|
||||
// Mutable parser state shared across recursive parse() calls.
interface State {
  stack: StackItem[]; // open-element stack; last item receives new children
  cursor: number; // current index into tokens
  tokens: Token[]; // token stream produced by the lexer
}
|
||||
|
||||
export const parser = (tokens: Token[]) => {
|
||||
const root: StackItem = { tagName: null, children: [] }
|
||||
const state: State = { tokens, cursor: 0, stack: [root] }
|
||||
parse(state)
|
||||
return root.children
|
||||
}
|
||||
|
||||
export const hasTerminalParent = (tagName: string, stack: StackItem[]) => {
|
||||
const tagParents = closingTagAncestorBreakers[tagName]
|
||||
if (tagParents) {
|
||||
let currentIndex = stack.length - 1
|
||||
while (currentIndex >= 0) {
|
||||
const parentTagName = stack[currentIndex].tagName
|
||||
if (parentTagName === tagName) break
|
||||
if (tagParents.includes(parentTagName)) return true
|
||||
currentIndex--
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
export const rewindStack = (stack: StackItem[], newLength: number) => {
|
||||
stack.splice(newLength)
|
||||
}
|
||||
|
||||
// Core recursive-descent loop over the token stream. Each recursion level
// corresponds to one element on `stack`; new nodes are appended to the
// stack top's children. Progress is reported by writing the advanced
// cursor back into state.cursor so callers resume after the subtree.
export const parse = (state: State) => {
  const { stack, tokens } = state
  let { cursor } = state
  let nodes = stack[stack.length - 1].children // children list currently being filled
  const len = tokens.length

  while (cursor < len) {
    const token = tokens[cursor]
    if (token.type !== 'tag-start') {
      // Text and comment tokens become leaf nodes as-is.
      // NOTE(review): comment tokens are also pushed under this TextToken
      // cast; the runtime shape (type + content) is identical, so it is safe.
      nodes.push(token as TextToken)
      cursor++
      continue
    }

    // token is 'tag-start'; the following 'tag' token carries the name.
    const tagToken = tokens[++cursor] as TagToken
    cursor++
    const tagName = tagToken.content.toLowerCase()
    if (token.close) {
      // Closing tag: look for a matching open element on the stack.
      let index = stack.length
      let shouldRewind = false
      while (--index > -1) {
        if (stack[index].tagName === tagName) {
          shouldRewind = true
          break
        }
      }
      // Skip the closing tag's own 'tag-end' token(s).
      while (cursor < len) {
        if (tokens[cursor].type !== 'tag-end') break
        cursor++
      }
      if (shouldRewind) {
        // Pop down to the matched element and return to the caller's level.
        rewindStack(stack, index)
        break
      }
      // Unmatched closing tag: ignore it and keep parsing.
      else continue
    }

    // Auto-close: e.g. a new <li> implicitly closes an open <li>, unless a
    // terminal ancestor (like <ul>) sits closer on the stack.
    const isClosingTag = closingTags.includes(tagName)
    let shouldRewindToAutoClose = isClosingTag
    if (shouldRewindToAutoClose) {
      shouldRewindToAutoClose = !hasTerminalParent(tagName, stack)
    }

    if (shouldRewindToAutoClose) {
      // Pop back to just below the same-named ancestor, if any.
      let currentIndex = stack.length - 1
      while (currentIndex > 0) {
        if (tagName === stack[currentIndex].tagName) {
          rewindStack(stack, currentIndex)
          const previousIndex = currentIndex - 1
          nodes = stack[previousIndex].children
          break
        }
        currentIndex = currentIndex - 1
      }
    }

    // Collect attribute tokens up to the tag's 'tag-end'.
    const attributes = []
    let tagEndToken: TagEndToken | undefined
    while (cursor < len) {
      const _token = tokens[cursor]
      if (_token.type === 'tag-end') {
        tagEndToken = _token
        break
      }
      attributes.push((_token as AttributeToken).content)
      cursor++
    }

    // Truncated input (tag never closed): stop parsing.
    if (!tagEndToken) break

    cursor++
    const children: HTMLNode[] = []
    const elementNode: NormalElement = {
      type: 'element',
      tagName: tagToken.content,
      attributes,
      children,
    }
    nodes.push(elementNode)

    // Recurse into the element unless it is self-closed or a void tag.
    const hasChildren = !(tagEndToken.close || voidTags.includes(tagName))
    if (hasChildren) {
      stack.push({tagName, children})
      const innerState = { tokens, cursor, stack }
      parse(innerState)
      cursor = innerState.cursor
    }
  }
  state.cursor = cursor
}
|
28
src/utils/htmlParser/stringify.ts
Normal file
28
src/utils/htmlParser/stringify.ts
Normal file
@ -0,0 +1,28 @@
|
||||
import { AST, ElementAST, ElementAttribute } from './types'
|
||||
import { voidTags } from './tags'
|
||||
|
||||
export const formatAttributes = (attributes: ElementAttribute[]) => {
|
||||
return attributes.reduce((attrs, attribute) => {
|
||||
const { key, value } = attribute
|
||||
if (value === null) return `${attrs} ${key}`
|
||||
if (key === 'style' && !value) return ''
|
||||
|
||||
const quoteEscape = value.indexOf('\'') !== -1
|
||||
const quote = quoteEscape ? '"' : '\''
|
||||
return `${attrs} ${key}=${quote}${value}${quote}`
|
||||
}, '')
|
||||
}
|
||||
|
||||
export const toHTML = (tree: AST[]) => {
|
||||
const htmlStrings: string[] = tree.map(node => {
|
||||
if (node.type === 'text') return node.content
|
||||
if (node.type === 'comment') return `<!--${node.content}-->`
|
||||
|
||||
const { tagName, attributes, children } = node as ElementAST
|
||||
const isSelfClosing = voidTags.includes(tagName.toLowerCase())
|
||||
|
||||
if (isSelfClosing) return `<${tagName}${formatAttributes(attributes)}>`
|
||||
return `<${tagName}${formatAttributes(attributes)}>${toHTML(children)}</${tagName}>`
|
||||
})
|
||||
return htmlStrings.join('')
|
||||
}
|
16
src/utils/htmlParser/tags.ts
Normal file
16
src/utils/htmlParser/tags.ts
Normal file
@ -0,0 +1,16 @@
|
||||
// Tags whose raw content is never parsed as child markup; the lexer skips
// straight to the matching closing tag (see lexSkipTag).
export const childlessTags = ['style', 'script', 'template']

// Tags that may be implicitly closed when a new same-named sibling starts
// (the parser's auto-close handling).
export const closingTags = ['html', 'head', 'body', 'p', 'dt', 'dd', 'li', 'option', 'thead', 'th', 'tbody', 'tr', 'td', 'tfoot', 'colgroup']

// For each auto-closing tag, ancestors that STOP the auto-close search:
// e.g. a new <li> must not close an <li> that lives outside the current <ul>.
export const closingTagAncestorBreakers = {
  li: ['ul', 'ol', 'menu'],
  dt: ['dl'],
  dd: ['dl'],
  tbody: ['table'],
  thead: ['table'],
  tfoot: ['table'],
  tr: ['table'],
  td: ['table'],
}

// Void elements: never have children and take no closing tag when
// serialized (see toHTML) or parsed (see parse).
export const voidTags = ['!doctype', 'area', 'base', 'br', 'col', 'command', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr']
|
69
src/utils/htmlParser/types.ts
Normal file
69
src/utils/htmlParser/types.ts
Normal file
@ -0,0 +1,69 @@
|
||||
// A parsed attribute; value is null for bare boolean attributes
// (e.g. <input disabled>).
export interface ElementAttribute {
  key: string;
  value: string | null;
}

// Raw parser node for an HTML comment.
export interface CommentElement {
  type: 'comment';
  content: string;
}

// Raw parser node for a run of plain text.
export interface TextElement {
  type: 'text';
  content: string;
}

// Raw parser node for an element; attributes are still unparsed strings
// (e.g. `class="x"`) until format() converts them.
export interface NormalElement {
  type: 'element';
  tagName: string;
  children: HTMLNode[];
  attributes: string[];
}

// Discriminated union of raw parser output nodes (discriminant: `type`).
export type HTMLNode = CommentElement | TextElement | NormalElement

// Normalized element node: lower-cased tag name and key/value attributes.
export interface ElementAST {
  type: 'element';
  tagName: string;
  children: AST[];
  attributes: ElementAttribute[];
}

// Normalized leaf node (comment or text).
export interface CommentOrTextAST {
  type: 'comment' | 'text';
  content: string;
}

// Public AST node union produced by format()/toAST().
export type AST = CommentOrTextAST | ElementAST

// Lexer token: '<' ('close' false) or '</' ('close' true).
export interface TagStartToken {
  type: 'tag-start';
  close: boolean;
}

// Lexer token: '>' ('close' false) or '/>' ('close' true).
export interface TagEndToken {
  type: 'tag-end';
  close: boolean;
}

// Lexer token carrying a tag name (case preserved).
export interface TagToken {
  type: 'tag';
  content: string;
}

// Lexer token for a run of plain text.
export interface TextToken {
  type: 'text';
  content: string;
}

// Lexer token for a comment body (without the '<!--'/'-->' delimiters).
export interface CommentToken {
  type: 'comment';
  content: string;
}

// Lexer token for one raw attribute string, e.g. `class="x"` or `disabled`.
export interface AttributeToken {
  type: 'attribute';
  content: string;
}

// Discriminated union of all lexer tokens (discriminant: `type`).
export type Token = TagStartToken | TagEndToken | TagToken | TextToken | CommentToken | AttributeToken
|
Loading…
x
Reference in New Issue
Block a user