/**
 * Minimal HTML parser and serializer: `lexer` -> `parser` -> `format` produce
 * an AST (`toAST`), and `toHTML` turns an AST back into markup.
 *
 * Provenance: recovered from a whitespace-collapsed git patch
 *   commit  a10704c0ba7834c1ee6f2f366e33ac0baac57009
 *   author  pipipi-pikachu, Fri, 4 Jun 2021 17:13:06 +0800
 *   subject "feat: add HTML parsing methods" (translated from the original)
 * which created src/utils/htmlParser/{format,index,lexer,parser,stringify,tags,types}.ts.
 * The seven files are kept below as clearly separated sections.
 *
 * Reference: https://github.com/andrejewski/himalaya, rewritten in TypeScript
 * with part of the functionality simplified.
 */

// ---------------------------------------------------------------------------
// types.ts
// ---------------------------------------------------------------------------

/** A parsed attribute; `value` is `null` for bare attributes such as `hidden`. */
export interface ElementAttribute {
  key: string;
  value: string | null;
}

export interface CommentElement {
  type: 'comment';
  content: string;
}

export interface TextElement {
  type: 'text';
  content: string;
}

/** Parser output: attributes are still raw `key=value` strings at this stage. */
export interface NormalElement {
  type: 'element';
  tagName: string;
  children: HTMLNode[];
  attributes: string[];
}

export type HTMLNode = CommentElement | TextElement | NormalElement

/** Formatter output: lower-cased tag name and structured attributes. */
export interface ElementAST {
  type: 'element';
  tagName: string;
  children: AST[];
  attributes: ElementAttribute[];
}

export interface CommentOrTextAST {
  type: 'comment' | 'text';
  content: string;
}

export type AST = CommentOrTextAST | ElementAST

/** `<` (close === false) or `</` (close === true). */
export interface TagStartToken {
  type: 'tag-start';
  close: boolean;
}

/** `>` (close === false) or `/>` (close === true). */
export interface TagEndToken {
  type: 'tag-end';
  close: boolean;
}

export interface TagToken {
  type: 'tag';
  content: string;
}

export interface TextToken {
  type: 'text';
  content: string;
}

export interface CommentToken {
  type: 'comment';
  content: string;
}

export interface AttributeToken {
  type: 'attribute';
  content: string;
}

export type Token = TagStartToken | TagEndToken | TagToken | TextToken | CommentToken | AttributeToken

// ---------------------------------------------------------------------------
// tags.ts
// ---------------------------------------------------------------------------

/** Tags whose content is lexed as one raw text run up to the matching close tag. */
export const childlessTags = ['style', 'script', 'template']

/** Tags that implicitly close a still-open element of the same name. */
export const closingTags = ['html', 'head', 'body', 'p', 'dt', 'dd', 'li', 'option', 'thead', 'th', 'tbody', 'tr', 'td', 'tfoot', 'colgroup']

/**
 * For a closing tag, the ancestors that stop the implicit-close search
 * (e.g. an `li` nested in a new `ul` does not close the outer `li`).
 */
export const closingTagAncestorBreakers: Record<string, string[]> = {
  li: ['ul', 'ol', 'menu'],
  dt: ['dl'],
  dd: ['dl'],
  tbody: ['table'],
  thead: ['table'],
  tfoot: ['table'],
  tr: ['table'],
  td: ['table'],
}

/** Tags that never have children and are serialized without a closing tag. */
export const voidTags = ['!doctype', 'area', 'base', 'br', 'col', 'command', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr']

// ---------------------------------------------------------------------------
// format.ts
// ---------------------------------------------------------------------------

/**
 * Splits `str` at the first occurrence of `sep`.
 * @returns `[str]` when `sep` does not occur, otherwise `[head, rest]`.
 */
export const splitHead = (str: string, sep: string) => {
  const idx = str.indexOf(sep)
  if (idx === -1) return [str]
  return [str.slice(0, idx), str.slice(idx + sep.length)]
}

/** Removes one pair of matching surrounding quotes (`"` or `'`), if present. */
const unquote = (str: string) => {
  const car = str.charAt(0)
  const end = str.length - 1
  const isQuoteStart = car === '"' || car === "'"
  // `end > 0` keeps a lone quote character from being treated as a quoted empty string.
  if (isQuoteStart && end > 0 && car === str.charAt(end)) {
    return str.slice(1, end)
  }
  return str
}

/** Turns raw `key=value` attribute strings into `{ key, value }` records. */
const parseAttributes = (attributes: string[]): ElementAttribute[] => {
  return attributes.map(attribute => {
    const parts = splitHead(attribute.trim(), '=')
    const key = parts[0]
    const value = typeof parts[1] === 'string' ? unquote(parts[1]) : null
    return { key, value }
  })
}

/** Converts parser nodes into the public AST (lower-cased tags, parsed attributes). */
export const format = (nodes: HTMLNode[]): AST[] => {
  return nodes.map(node => {
    if (node.type === 'element') {
      const children = format(node.children)
      const item: ElementAST = {
        type: 'element',
        tagName: node.tagName.toLowerCase(),
        attributes: parseAttributes(node.attributes),
        children,
      }
      return item
    }

    const item: CommentOrTextAST = {
      type: node.type,
      content: node.content,
    }
    return item
  })
}

// ---------------------------------------------------------------------------
// lexer.ts
// ---------------------------------------------------------------------------

interface LexerState {
  str: string;
  position: number;
  tokens: Token[];
}

const WHITESPACE = /\s/
const TAG_NAME_START = /[A-Za-z0-9]/

/** Advances the cursor by `len` characters. */
const movePosition = (state: LexerState, len: number) => {
  state.position = state.position + len
}

/** Moves the cursor to the absolute index `end`. */
const jumpPosition = (state: LexerState, end: number) => {
  movePosition(state, end - state.position)
}

/**
 * Finds the next `<` at or after `index` that starts a tag, a close tag or a
 * comment/doctype; a `<` followed by anything else is plain text.
 * @returns the index of that `<`, or -1 when the rest of `str` is text.
 */
const findTextEnd = (str: string, index: number) => {
  let textEnd = str.indexOf('<', index)
  while (textEnd !== -1) {
    const char = str.charAt(textEnd + 1)
    if (char === '/' || char === '!' || TAG_NAME_START.test(char)) return textEnd
    textEnd = str.indexOf('<', textEnd + 1)
  }
  return -1
}

/** Emits a text token up to the next tag start; emits nothing if already at one. */
const lexText = (state: LexerState) => {
  const { str } = state
  let textEnd = findTextEnd(str, state.position)
  if (textEnd === state.position) return
  if (textEnd === -1) {
    textEnd = str.length
  }

  const content = str.slice(state.position, textEnd)
  jumpPosition(state, textEnd)

  state.tokens.push({
    type: 'text',
    content,
  })
}

/** Emits a comment token; an unterminated comment runs to the end of input. */
const lexComment = (state: LexerState) => {
  const { str } = state

  movePosition(state, 4) // skip "<!--"
  let contentEnd = str.indexOf('-->', state.position)
  let commentEnd = contentEnd + 3 // skip "-->"
  if (contentEnd === -1) {
    contentEnd = commentEnd = str.length
  }

  const content = str.slice(state.position, contentEnd)
  jumpPosition(state, commentEnd)

  state.tokens.push({
    type: 'comment',
    content,
  })
}

const isTagNameChar = (char: string) => !(WHITESPACE.test(char) || char === '/' || char === '>')

/** Emits a `tag` token holding the tag name (original casing) and returns it. */
const lexTagName = (state: LexerState) => {
  const { str } = state
  const len = str.length
  let start = state.position

  // Skip anything before the name that cannot be part of it.
  while (start < len) {
    if (isTagNameChar(str.charAt(start))) break
    start++
  }

  let end = start + 1
  while (end < len) {
    if (!isTagNameChar(str.charAt(end))) break
    end++
  }

  jumpPosition(state, end)
  const tagName = str.slice(start, end)
  state.tokens.push({
    type: 'tag',
    content: tagName
  })
  return tagName
}

/**
 * Emits one `attribute` token per attribute, leaving the cursor on the `/` or
 * `>` that ends the tag (or at end of input).
 */
const lexTagAttributes = (state: LexerState) => {
  const { str, tokens } = state
  let cursor = state.position
  let quote: string | null = null
  let wordBegin = cursor
  const words: string[] = []
  const len = str.length

  // Pass 1: split on whitespace outside quotes.
  while (cursor < len) {
    const char = str.charAt(cursor)
    if (quote) {
      const isQuoteEnd = char === quote
      if (isQuoteEnd) quote = null
      cursor++
      continue
    }

    const isTagEnd = char === '/' || char === '>'
    if (isTagEnd) {
      if (cursor !== wordBegin) words.push(str.slice(wordBegin, cursor))
      break
    }

    const isWordEnd = WHITESPACE.test(char)
    if (isWordEnd) {
      if (cursor !== wordBegin) words.push(str.slice(wordBegin, cursor))
      wordBegin = cursor + 1
      cursor++
      continue
    }

    const isQuoteStart = char === '\'' || char === '"'
    if (isQuoteStart) {
      quote = char
      cursor++
      continue
    }

    cursor++
  }
  jumpPosition(state, cursor)

  // Pass 2: re-join words that were split around "=" (`a = b`, `a =b`, `a= b`).
  const type = 'attribute'
  for (let i = 0; i < words.length; i++) {
    const word = words[i]

    const isNotPair = word.indexOf('=') === -1
    if (isNotPair) {
      const secondWord = words[i + 1]
      if (secondWord && secondWord.startsWith('=')) {
        if (secondWord.length > 1) {
          const newWord = word + secondWord
          tokens.push({ type, content: newWord })
          i += 1
          continue
        }
        const thirdWord = words[i + 2]
        i += 1
        if (thirdWord) {
          const newWord = word + '=' + thirdWord
          tokens.push({ type, content: newWord })
          i += 1
          continue
        }
      }
    }
    if (word.endsWith('=')) {
      const secondWord = words[i + 1]
      if (secondWord && secondWord.indexOf('=') === -1) {
        const newWord = word + secondWord
        tokens.push({ type, content: newWord })
        i += 1
        continue
      }

      const newWord = word.slice(0, -1)
      tokens.push({ type, content: newWord })
      continue
    }

    tokens.push({ type, content: word })
  }
}

/**
 * After the open tag of a childless tag, emits everything up to the matching
 * close tag as one text token, followed by the close tag's tokens.
 *
 * NOTE(review): most of this function was lost from the recovered patch (the
 * text after `str.indexOf('` was stripped); it is rebuilt here following the
 * upstream himalaya lexer — confirm against the original commit.
 */
const lexSkipTag = (tagName: string, state: LexerState) => {
  const { str, tokens } = state
  const safeTagName = tagName.toLowerCase()
  const len = str.length
  let index = state.position

  while (index < len) {
    const nextTag = str.indexOf('</', index)
    if (nextTag === -1) {
      lexText(state)
      break
    }

    // Lex the candidate close tag on a scratch state so a mismatch leaves `state` untouched.
    const tagState: LexerState = { str, position: nextTag, tokens: [] }
    const name = lexTag(tagState)
    if (safeTagName !== name.toLowerCase()) {
      index = tagState.position
      continue
    }

    if (nextTag !== state.position) {
      const textStart = state.position
      jumpPosition(state, nextTag)
      tokens.push({
        type: 'text',
        content: str.slice(textStart, nextTag),
      })
    }

    tokens.push(...tagState.tokens)
    jumpPosition(state, tagState.position)
    break
  }
}

/** Lexes one open or close tag (`tag-start`, `tag`, attributes, `tag-end`) and returns its name. */
const lexTag = (state: LexerState) => {
  const { str } = state
  const secondChar = str.charAt(state.position + 1)
  const tagStartClose = secondChar === '/'
  movePosition(state, tagStartClose ? 2 : 1)
  state.tokens.push({
    type: 'tag-start',
    close: tagStartClose,
  })

  const tagName = lexTagName(state)
  lexTagAttributes(state)

  const firstChar = str.charAt(state.position)
  const tagEndClose = firstChar === '/'
  movePosition(state, tagEndClose ? 2 : 1)
  state.tokens.push({
    type: 'tag-end',
    close: tagEndClose,
  })
  return tagName
}

/** Main lexing loop: alternates text runs with comments and tags. */
const lex = (state: LexerState) => {
  const str = state.str
  const len = str.length

  while (state.position < len) {
    const start = state.position
    lexText(state)

    // No text consumed means the cursor sits on a "<" that opens a comment or tag.
    if (state.position === start) {
      const isComment = str.startsWith('!--', start + 1)
      if (isComment) lexComment(state)
      else {
        const tagName = lexTag(state)
        const safeTag = tagName.toLowerCase()
        if (childlessTags.includes(safeTag)) lexSkipTag(tagName, state)
      }
    }
  }
}

/** Splits an HTML string into a flat token list. */
export const lexer = (str: string): Token[] => {
  const state: LexerState = {
    str,
    position: 0,
    tokens: [],
  }
  lex(state)
  return state.tokens
}

// ---------------------------------------------------------------------------
// parser.ts
// ---------------------------------------------------------------------------

interface StackItem {
  tagName: string | null;
  children: HTMLNode[];
}

interface ParserState {
  stack: StackItem[];
  cursor: number;
  tokens: Token[];
}

/** Builds the node tree for a token list produced by `lexer`. */
export const parser = (tokens: Token[]) => {
  const root: StackItem = { tagName: null, children: [] }
  const state: ParserState = { tokens, cursor: 0, stack: [root] }
  parse(state)
  return root.children
}

/**
 * Reports whether, walking up the open-element stack, one of `tagName`'s
 * ancestor breakers is met before an open element named `tagName`.
 */
export const hasTerminalParent = (tagName: string, stack: StackItem[]) => {
  const tagParents = closingTagAncestorBreakers[tagName]
  if (tagParents) {
    let currentIndex = stack.length - 1
    while (currentIndex >= 0) {
      const parentTagName = stack[currentIndex].tagName
      if (parentTagName === tagName) break
      // The root item has a null tag name and can never be a breaker.
      if (parentTagName !== null && tagParents.includes(parentTagName)) return true
      currentIndex--
    }
  }
  return false
}

/** Truncates the open-element stack to `newLength` items (mutates `stack`). */
export const rewindStack = (stack: StackItem[], newLength: number) => {
  stack.splice(newLength)
}

/**
 * Consumes tokens from `state.cursor`, appending nodes to the children of the
 * element on top of `state.stack`, and recursing for elements with children.
 * Writes the final cursor back to `state.cursor`.
 */
export const parse = (state: ParserState) => {
  const { stack, tokens } = state
  let { cursor } = state
  let nodes = stack[stack.length - 1].children
  const len = tokens.length

  while (cursor < len) {
    const token = tokens[cursor]
    if (token.type !== 'tag-start') {
      // Text and comment tokens already have the shape of their nodes.
      nodes.push(token as TextToken | CommentToken)
      cursor++
      continue
    }

    const tagToken = tokens[++cursor]
    // The lexer always emits a tag token after tag-start; stop on a malformed stream.
    if (!tagToken || tagToken.type !== 'tag') break
    cursor++
    const tagName = tagToken.content.toLowerCase()
    if (token.close) {
      // Close tag: find the nearest open element with that name.
      let index = stack.length
      let shouldRewind = false
      while (--index > -1) {
        if (stack[index].tagName === tagName) {
          shouldRewind = true
          break
        }
      }
      while (cursor < len) {
        if (tokens[cursor].type !== 'tag-end') break
        cursor++
      }
      if (shouldRewind) {
        rewindStack(stack, index)
        break
      }
      else continue // stray close tag: ignore it
    }

    const isClosingTag = closingTags.includes(tagName)
    let shouldRewindToAutoClose = isClosingTag
    if (shouldRewindToAutoClose) {
      shouldRewindToAutoClose = !hasTerminalParent(tagName, stack)
    }

    if (shouldRewindToAutoClose) {
      // e.g. a second <li> implicitly closes the previous, still-open <li>.
      let currentIndex = stack.length - 1
      while (currentIndex > 0) {
        if (tagName === stack[currentIndex].tagName) {
          rewindStack(stack, currentIndex)
          const previousIndex = currentIndex - 1
          nodes = stack[previousIndex].children
          break
        }
        currentIndex = currentIndex - 1
      }
    }

    const attributes: string[] = []
    let tagEndToken: TagEndToken | undefined
    while (cursor < len) {
      const _token = tokens[cursor]
      if (_token.type === 'tag-end') {
        tagEndToken = _token
        break
      }
      attributes.push((_token as AttributeToken).content)
      cursor++
    }

    if (!tagEndToken) break

    cursor++
    const children: HTMLNode[] = []
    const elementNode: NormalElement = {
      type: 'element',
      tagName: tagToken.content,
      attributes,
      children,
    }
    nodes.push(elementNode)

    const hasChildren = !(tagEndToken.close || voidTags.includes(tagName))
    if (hasChildren) {
      stack.push({tagName, children})
      const innerState: ParserState = { tokens, cursor, stack }
      parse(innerState)
      cursor = innerState.cursor

      // A nested close tag may have rewound past this frame's element
      // (`<div><span>a</div>b`). Hand control back to the frame that owns the
      // stack top instead of appending later nodes to a closed element.
      if (stack[stack.length - 1].children !== nodes) break
    }
  }
  state.cursor = cursor
}

// ---------------------------------------------------------------------------
// stringify.ts
// ---------------------------------------------------------------------------

/**
 * Serializes attributes to ` key='value'` pairs, each with a leading space.
 * Bare attributes (`value === null`) are emitted as ` key`; an empty `style`
 * is dropped. Double quotes are used when the value contains a single quote.
 */
export const formatAttributes = (attributes: ElementAttribute[]) => {
  return attributes.reduce((attrs, attribute) => {
    const { key, value } = attribute
    if (value === null) return `${attrs} ${key}`
    // Skip only this attribute; returning '' would drop the ones already emitted.
    if (key === 'style' && !value) return attrs

    const hasSingleQuote = value.includes('\'')
    if (hasSingleQuote && value.includes('"')) {
      // Neither quote can delimit the value as-is, so escape the double quotes.
      return `${attrs} ${key}="${value.replace(/"/g, '&quot;')}"`
    }
    const quote = hasSingleQuote ? '"' : '\''
    return `${attrs} ${key}=${quote}${value}${quote}`
  }, '')
}

/** Serializes an AST back to an HTML string; void tags get no closing tag. */
export const toHTML = (tree: AST[]): string => {
  const htmlStrings: string[] = tree.map(node => {
    if (node.type !== 'element') {
      return node.type === 'comment' ? `<!--${node.content}-->` : node.content
    }

    const { tagName, attributes, children } = node
    const isSelfClosing = voidTags.includes(tagName.toLowerCase())

    if (isSelfClosing) return `<${tagName}${formatAttributes(attributes)}>`
    return `<${tagName}${formatAttributes(attributes)}>${toHTML(children)}</${tagName}>`
  })
  return htmlStrings.join('')
}

// ---------------------------------------------------------------------------
// index.ts
// ---------------------------------------------------------------------------

/** Parses an HTML string into the AST. */
export const toAST = (str: string) => {
  const tokens = lexer(str)
  const nodes = parser(tokens)
  return format(nodes)
}