feat: 添加HTML解析方法

2025-04-15 02:20:00 +08:00 · 2021-06-04 17:13:06 +08:00 · 2021-06-04 17:13:06 +08:00 · a10704c0ba
commit a10704c0ba
parent 07c3b74ce3
7 changed files with 580 additions and 0 deletions
--- a/src/utils/htmlParser/format.ts
+++ b/src/utils/htmlParser/format.ts
@ -0,0 +1,47 @@
+import { HTMLNode, CommentOrTextAST, ElementAST, AST } from './types'
+
+/**
+ * Split `str` at the first occurrence of `sep`.
+ *
+ * @param str - Text to split.
+ * @param sep - Separator to look for.
+ * @returns `[head, tail]` when `sep` is found (the separator itself is dropped),
+ *          otherwise a one-element array holding `str` unchanged.
+ */
+export const splitHead = (str: string, sep: string) => {
+  const cut = str.indexOf(sep)
+  return cut === -1
+    ? [str]
+    : [str.slice(0, cut), str.slice(cut + sep.length)]
+}
+
+/**
+ * Strip one pair of matching surrounding quotes (single or double) from `str`.
+ *
+ * @param str - Possibly quoted text.
+ * @returns The text between the quotes when the first and last characters are
+ *          the same quote character; otherwise `str` unchanged.
+ */
+const unquote = (str: string) => {
+  const last = str.length - 1
+  const first = str.charAt(0)
+  if (first !== '"' && first !== "'") return str
+  return str.charAt(last) === first ? str.slice(1, last) : str
+}
+
+/**
+ * Turn raw attribute strings into `{ key, value }` pairs.
+ *
+ * Each string is trimmed and split at its first `=`. The part after the `=` is
+ * unquoted; an attribute without any `=` gets a `null` value.
+ *
+ * @param attributes - Raw attribute strings such as `class="a"` or `disabled`.
+ * @returns One `{ key, value }` object per input string, in the same order.
+ */
+const formatAttributes = (attributes: string[]) => {
+  return attributes.map(attribute => {
+    const [key, rawValue] = splitHead(attribute.trim(), '=')
+    const value = typeof rawValue === 'string' ? unquote(rawValue) : null
+    return { key, value }
+  })
+}
+
+/**
+ * Convert parser nodes into the public AST shape.
+ *
+ * Element nodes get a lower-cased tag name, structured attributes and
+ * recursively formatted children; comment and text nodes are copied down to
+ * their `type` and `content`.
+ *
+ * @param nodes - Nodes produced by the parser.
+ * @returns The formatted AST nodes, in the same order.
+ */
+export const format = (nodes: HTMLNode[]): AST[] => {
+  return nodes.map(node => {
+    if (node.type !== 'element') {
+      const leaf: CommentOrTextAST = { type: node.type, content: node.content }
+      return leaf
+    }
+
+    // Format the subtree first, then assemble the element around it.
+    const children = format(node.children)
+    const element: ElementAST = {
+      type: 'element',
+      tagName: node.tagName.toLowerCase(),
+      attributes: formatAttributes(node.attributes),
+      children,
+    }
+    return element
+  })
+}
--- a/src/utils/htmlParser/index.ts
+++ b/src/utils/htmlParser/index.ts
@ -0,0 +1,15 @@
+// 参考：https://github.com/andrejewski/himalaya 用TypeScript重写并简化部分功能
+
+import { lexer } from './lexer'
+import { parser } from './parser'
+import { format } from './format'
+import { toHTML } from './stringify'
+import type { AST } from './types'
+
+/**
+ * Parse an HTML string into an AST by running it through the lexer, the
+ * parser and the formatter, in that order.
+ *
+ * @param str - HTML source text.
+ * @returns The formatted AST nodes.
+ */
+export const toAST = (str: string) => format(parser(lexer(str)))
+
+export { toHTML, AST }
--- a/src/utils/htmlParser/lexer.ts
+++ b/src/utils/htmlParser/lexer.ts
@ -0,0 +1,276 @@
+import startsWith from 'lodash/startsWith'
+import endsWith from 'lodash/endsWith'
+import { Token } from './types'
+import { childlessTags } from './tags'
+
+/** Mutable lexer state shared by all `lex*` helpers. */
+interface State {
+  /** The full input being lexed. */
+  str: string;
+  /** Index into `str` of the next character to lex. */
+  position: number;
+  /** Tokens emitted so far, in source order. */
+  tokens: Token[];
+}
+
+/**
+ * Move the lexer position to the absolute index `end`.
+ *
+ * @param state - Lexer state to mutate.
+ * @param end - Target index in `state.str`.
+ */
+const jumpPosition = (state: State, end: number) => {
+  movePositopn(state, end - state.position)
+}
+
+/**
+ * Advance the lexer position by `len` characters (relative move).
+ *
+ * @param state - Lexer state to mutate.
+ * @param len - Number of characters to add to `state.position`.
+ */
+const movePositopn = (state: State, len: number) => {
+  state.position += len
+}
+
+/**
+ * Find where a run of text ends, i.e. the index of the next `<` that looks
+ * like the start of markup.
+ *
+ * A `<` only counts when it is immediately followed by `/`, `!` or an ASCII
+ * letter or digit; any other `<` is treated as plain text and skipped.
+ *
+ * @param str - Input being lexed.
+ * @param index - Index to start searching from.
+ * @returns Index of the qualifying `<`, or -1 when there is none.
+ */
+const findTextEnd = (str: string, index: number) => {
+  const markupStart = /<[\/!A-Za-z0-9]/g
+  // Begin the search at `index` instead of the start of the string.
+  markupStart.lastIndex = index
+  const found = markupStart.exec(str)
+  return found ? found.index : -1
+}
+
+/**
+ * Lex a run of text starting at the current position and emit a `text` token.
+ *
+ * Emits nothing and leaves the position untouched when markup starts right at
+ * the current position. When no further markup exists, the text runs to the
+ * end of the input.
+ *
+ * @param state - Lexer state; `position` and `tokens` are updated.
+ */
+const lexText = (state: State) => {
+  const { str, position: textStart } = state
+  const found = findTextEnd(str, textStart)
+  if (found === textStart) return
+  const textEnd = found === -1 ? str.length : found
+
+  jumpPosition(state, textEnd)
+  state.tokens.push({ type: 'text', content: str.slice(textStart, textEnd) })
+}
+
+/**
+ * Lex an HTML comment that begins at the current position and emit a
+ * `comment` token holding the text between the opening and closing markers.
+ *
+ * An unterminated comment swallows the rest of the input.
+ *
+ * @param state - Lexer state; `position` and `tokens` are updated.
+ */
+const lexComment = (state: State) => {
+  const { str } = state
+
+  // Step over the four-character opening marker.
+  movePositopn(state, 4)
+  const contentStart = state.position
+  const closer = str.indexOf('-->', contentStart)
+  const isClosed = closer !== -1
+  const contentEnd = isClosed ? closer : str.length
+  // Resume after the three-character closing marker when there is one.
+  const commentEnd = isClosed ? closer + 3 : str.length
+
+  jumpPosition(state, commentEnd)
+  state.tokens.push({ type: 'comment', content: str.slice(contentStart, contentEnd) })
+}
+
+/**
+ * Lex a tag name starting at the current position and emit a `tag` token.
+ *
+ * Leading whitespace, `/` and `>` characters are skipped; the name then runs
+ * until the next whitespace, `/` or `>`.
+ *
+ * @param state - Lexer state; `position` and `tokens` are updated.
+ * @returns The tag name exactly as written in the source.
+ */
+const lexTagName = (state: State) => {
+  const { str } = state
+  const total = str.length
+  const isNameChar = (ch: string) => !/\s/.test(ch) && ch !== '/' && ch !== '>'
+
+  // Skip anything before the name that cannot be part of it.
+  let nameStart = state.position
+  while (nameStart < total && !isNameChar(str.charAt(nameStart))) nameStart++
+
+  // The character at nameStart is taken as-is; scan on from the next one.
+  let nameEnd = nameStart + 1
+  while (nameEnd < total && isNameChar(str.charAt(nameEnd))) nameEnd++
+
+  jumpPosition(state, nameEnd)
+  const tagName = str.slice(nameStart, nameEnd)
+  state.tokens.push({ type: 'tag', content: tagName })
+  return tagName
+}
+
+/**
+ * Lex the attribute section of a tag, starting at `state.position`, and emit
+ * one `attribute` token per attribute.
+ *
+ * Works in two passes: first the raw text up to the tag end (`/` or `>`
+ * outside quotes) is split into whitespace-separated words, then words that
+ * belong to the same attribute (`key = value`, `key =value`, `key= value`)
+ * are merged back together.
+ *
+ * On return `state.position` points at the `/` or `>` that ended the scan, or
+ * at the end of the input when neither was found.
+ *
+ * @param state - Lexer state; `position` and `tokens` are updated.
+ */
+const lexTagAttributes = (state: State) => {
+  const { str, tokens } = state
+  let cursor = state.position
+  // The quote character currently open, or null when outside a quoted value.
+  let quote = null
+  let wordBegin = cursor
+  const words = []
+  const len = str.length
+  // Pass 1: split into words, keeping quoted sections intact.
+  while (cursor < len) {
+    const char = str.charAt(cursor)
+    if (quote) {
+      // Inside quotes everything is literal until the matching quote.
+      const isQuoteEnd = char === quote
+      if (isQuoteEnd) quote = null
+      cursor++
+      continue
+    }
+
+    const isTagEnd = char === '/' || char === '>'
+    if (isTagEnd) {
+      // Flush the word in progress (if any) and stop scanning.
+      if (cursor !== wordBegin) words.push(str.slice(wordBegin, cursor))
+      break
+    }
+
+    const isWordEnd = /\s/.test(char)
+    if (isWordEnd) {
+      if (cursor !== wordBegin) words.push(str.slice(wordBegin, cursor))
+      wordBegin = cursor + 1
+      cursor++
+      continue
+    }
+
+    const isQuoteStart = char === '\'' || char === '"'
+    if (isQuoteStart) {
+      quote = char
+      cursor++
+      continue
+    }
+
+    cursor++
+  }
+  jumpPosition(state, cursor)
+
+  // Pass 2: merge words that were separated by whitespace around "=".
+  const type = 'attribute'
+  for (let i = 0; i < words.length; i++) {
+    const word = words[i]
+
+    const isNotPair = word.indexOf('=') === -1
+    if (isNotPair) {
+      const secondWord = words[i + 1]
+      if (secondWord && startsWith(secondWord, '=')) {
+        if (secondWord.length > 1) {
+          // `key =value`: glue the key onto the "=value" word.
+          const newWord = word + secondWord
+          tokens.push({ type, content: newWord })
+          i += 1
+          continue
+        }
+        // `key = value`: the next word is a lone "="; skip it.
+        const thirdWord = words[i + 2]
+        i += 1
+        if (thirdWord) {
+          const newWord = word + '=' + thirdWord
+          tokens.push({ type, content: newWord })
+          i += 1
+          continue
+        }
+        // `key =` with nothing after it falls through; the lone "=" has
+        // already been skipped and the key is emitted on its own below.
+      }
+    }
+    if (endsWith(word, '=')) {
+      // `key= value`: take the next word as the value unless it is itself a pair.
+      const secondWord = words[i + 1]
+      if (secondWord && secondWord.indexOf('=') === -1) {
+        const newWord = word + secondWord
+        tokens.push({ type, content: newWord })
+        i += 1
+        continue
+      }
+
+      // No usable value: drop the trailing "=" and emit the key alone.
+      const newWord = word.slice(0, -1)
+      tokens.push({ type, content: newWord })
+      continue
+    }
+
+    tokens.push({ type, content: word })
+  }
+}
+
+/**
+ * Lex the body of a tag whose content must not be parsed as markup.
+ *
+ * Searches forward for a closing tag whose name matches `tagName`
+ * (case-insensitively). Everything before it becomes a single `text` token,
+ * followed by the closing tag's own tokens. When the input holds no further
+ * closing-tag opener, the remainder is handed to `lexText`.
+ *
+ * @param tagName - Name of the tag that was just opened.
+ * @param state - Lexer state; `position` and `tokens` are updated.
+ */
+const lexSkipTag = (tagName: string, state: State) => {
+  const { str, tokens } = state
+  const expectedName = tagName.toLowerCase()
+
+  for (let searchFrom = state.position; searchFrom < str.length;) {
+    const closeAt = str.indexOf('</', searchFrom)
+    if (closeAt === -1) {
+      lexText(state)
+      return
+    }
+
+    // Lex the candidate closing tag into a scratch state so nothing is
+    // committed unless its name matches.
+    const probe = {
+      str,
+      position: state.position,
+      tokens: [],
+    }
+    jumpPosition(probe, closeAt)
+    const closingName = lexTag(probe)
+    if (closingName.toLowerCase() !== expectedName) {
+      searchFrom = probe.position
+      continue
+    }
+
+    // Everything between the opening tag and the matching closer is raw text.
+    if (closeAt !== state.position) {
+      const rawText = str.slice(state.position, closeAt)
+      jumpPosition(state, closeAt)
+      tokens.push({ type: 'text', content: rawText })
+    }
+
+    tokens.push(...probe.tokens)
+    jumpPosition(state, probe.position)
+    return
+  }
+}
+
+/**
+ * Lex one complete tag starting at the `<` under the current position.
+ *
+ * Emits, in order: a `tag-start` token (`close` is true when the tag opens
+ * with `</`), the `tag` name token, any `attribute` tokens, and a `tag-end`
+ * token (`close` is true when the tag ends with `/`).
+ *
+ * @param state - Lexer state; `position` and `tokens` are updated.
+ * @returns The tag name exactly as written in the source.
+ */
+const lexTag = (state: State) => {
+  const { str } = state
+
+  const opensWithSlash = str.charAt(state.position + 1) === '/'
+  movePositopn(state, opensWithSlash ? 2 : 1)
+  state.tokens.push({ type: 'tag-start', close: opensWithSlash })
+
+  const tagName = lexTagName(state)
+  lexTagAttributes(state)
+
+  const endsWithSlash = str.charAt(state.position) === '/'
+  movePositopn(state, endsWithSlash ? 2 : 1)
+  state.tokens.push({ type: 'tag-end', close: endsWithSlash })
+
+  return tagName
+}
+
+/**
+ * Main lexer loop: consume the whole input, alternating between text runs,
+ * comments and tags.
+ *
+ * Tags listed in `childlessTags` have their body lexed as raw text.
+ *
+ * @param state - Lexer state; `position` and `tokens` are updated.
+ */
+const lex = (state: State) => {
+  const { str } = state
+  const total = str.length
+
+  while (state.position < total) {
+    const before = state.position
+    lexText(state)
+    // Text was consumed; look for more input from the new position.
+    if (state.position !== before) continue
+
+    // No progress means markup starts here: either a comment or a tag.
+    if (startsWith(str, '!--', before + 1)) {
+      lexComment(state)
+      continue
+    }
+
+    const tagName = lexTag(state)
+    if (childlessTags.includes(tagName.toLowerCase())) lexSkipTag(tagName, state)
+  }
+}
+
+/**
+ * Tokenize an HTML string.
+ *
+ * @param str - HTML source text.
+ * @returns The tokens in source order.
+ */
+export const lexer = (str: string): Token[] => {
+  const state: State = { str, position: 0, tokens: [] }
+  lex(state)
+  return state.tokens
+}
--- a/src/utils/htmlParser/parser.ts
+++ b/src/utils/htmlParser/parser.ts
@ -0,0 +1,129 @@
+import { Token, HTMLNode, TagToken, NormalElement, TagEndToken, AttributeToken, TextToken } from './types'
+import { closingTags, closingTagAncestorBreakers, voidTags } from './tags'
+
+/** One open element on the parser stack. */
+interface StackItem {
+  /** Lower-cased tag name of the open element; `null` for the root entry. */
+  tagName: string | null;
+  /** Child list that nodes found inside this element are appended to. */
+  children: HTMLNode[];
+}
+
+/** Parser state passed to (and updated by) `parse`. */
+interface State {
+  /** Currently open elements, outermost first. */
+  stack: StackItem[];
+  /** Index into `tokens` of the next token to consume. */
+  cursor: number;
+  /** Token stream being parsed. */
+  tokens: Token[];
+}
+
+/**
+ * Build a node tree from a token stream.
+ *
+ * @param tokens - Tokens produced by the lexer.
+ * @returns The top-level nodes collected under an implicit root.
+ */
+export const parser = (tokens: Token[]) => {
+  const root: StackItem = { tagName: null, children: [] }
+  parse({ tokens, cursor: 0, stack: [root] })
+  return root.children
+}
+
+/**
+ * Decide whether an ancestor listed in `closingTagAncestorBreakers` for
+ * `tagName` is open nearer to the top of the stack than any open `tagName`.
+ *
+ * The stack is walked from the innermost entry outwards. Reaching an entry
+ * named `tagName` first yields `false`; reaching a listed ancestor first
+ * yields `true`.
+ *
+ * @param tagName - Lower-cased name of the tag being opened.
+ * @param stack - Currently open elements, outermost first.
+ * @returns `true` when a listed ancestor is found before an open `tagName`;
+ *          `false` otherwise, including when `tagName` has no table entry.
+ */
+export const hasTerminalParent = (tagName: string, stack: StackItem[]) => {
+  const breakers = closingTagAncestorBreakers[tagName]
+  if (!breakers) return false
+
+  for (let depth = stack.length - 1; depth >= 0; depth--) {
+    const ancestor = stack[depth].tagName
+    if (ancestor === tagName) return false
+    if (breakers.includes(ancestor)) return true
+  }
+  return false
+}
+
+/**
+ * Truncate `stack` in place so that only its first `newLength` entries remain.
+ *
+ * @param stack - Parser stack to shorten.
+ * @param newLength - Index of the first entry to remove.
+ */
+export const rewindStack = (stack: StackItem[], newLength: number) => {
+  // Remove everything from `newLength` to the end of the stack.
+  stack.splice(newLength, stack.length)
+}
+
+/**
+ * Consume tokens from `state.cursor` onwards, appending nodes to the children
+ * of the element on top of `state.stack`.
+ *
+ * Calls itself recursively for each element that can have children. A call
+ * stops when the tokens run out, when a closing tag matches an element on
+ * the stack, or when a tag has no `tag-end` token. On return `state.cursor`
+ * holds the position reached.
+ *
+ * @param state - Parser state; `cursor` is updated and `stack` is mutated.
+ */
+export const parse = (state: State) => {
+  const { stack, tokens } = state
+  let { cursor } = state
+  let nodes = stack[stack.length - 1].children
+  const len = tokens.length
+  
+  while (cursor < len) {
+    const token = tokens[cursor]
+    // Anything that is not the start of a tag is appended as-is.
+    if (token.type !== 'tag-start') {
+      nodes.push(token as TextToken)
+      cursor++
+      continue
+    }
+
+    // The token right after a tag-start is taken to be the tag name.
+    const tagToken = tokens[++cursor] as TagToken
+    cursor++
+    const tagName = tagToken.content.toLowerCase()
+    if (token.close) {
+      // Closing tag: look for the innermost open element with this name.
+      let index = stack.length
+      let shouldRewind = false
+      while (--index > -1) {
+        if (stack[index].tagName === tagName) {
+          shouldRewind = true
+          break
+        }
+      }
+      // Skip the rest of the closing tag up to and including its tag-end.
+      while (cursor < len) {
+        if (tokens[cursor].type !== 'tag-end') break
+        cursor++
+      }
+      if (shouldRewind) {
+        // Drop the matched element and everything above it, then return.
+        rewindStack(stack, index)
+        break
+      } 
+      // A closing tag with no matching open element is ignored.
+      else continue
+    }
+
+    // Tags in `closingTags` close an open element of the same name, unless
+    // hasTerminalParent reports a listed ancestor nearer the top of the stack.
+    const isClosingTag = closingTags.includes(tagName)
+    let shouldRewindToAutoClose = isClosingTag
+    if (shouldRewindToAutoClose) {
+      shouldRewindToAutoClose = !hasTerminalParent(tagName, stack)
+    }
+
+    if (shouldRewindToAutoClose) {
+      // Search from the top of the stack down; index 0 (the root) is not checked.
+      let currentIndex = stack.length - 1
+      while (currentIndex > 0) {
+        if (tagName === stack[currentIndex].tagName) {
+          rewindStack(stack, currentIndex)
+          // The new element becomes a sibling of the one just closed.
+          const previousIndex = currentIndex - 1
+          nodes = stack[previousIndex].children
+          break
+        }
+        currentIndex = currentIndex - 1
+      }
+    }
+
+    // Collect attribute contents until the tag-end token.
+    const attributes = []
+    let tagEndToken: TagEndToken | undefined
+    while (cursor < len) {
+      const _token = tokens[cursor]
+      if (_token.type === 'tag-end') {
+        tagEndToken = _token
+        break
+      }
+      attributes.push((_token as AttributeToken).content)
+      cursor++
+    }
+
+    // The tokens ended before the tag did: stop without emitting the element.
+    if (!tagEndToken) break
+
+    cursor++
+    const children: HTMLNode[] = []
+    const elementNode: NormalElement = {
+      type: 'element',
+      tagName: tagToken.content,
+      attributes,
+      children,
+    }
+    nodes.push(elementNode)
+
+    // Self-closed tags and void tags never receive children.
+    const hasChildren = !(tagEndToken.close || voidTags.includes(tagName))
+    if (hasChildren) {
+      // Parse the element's content recursively, then resume where it stopped.
+      stack.push({tagName, children})
+      const innerState = { tokens, cursor, stack }
+      parse(innerState)
+      cursor = innerState.cursor
+    }
+  }
+  state.cursor = cursor
+}
--- a/src/utils/htmlParser/stringify.ts
+++ b/src/utils/htmlParser/stringify.ts
@ -0,0 +1,28 @@
+import { AST, ElementAST, ElementAttribute } from './types'
+import { voidTags } from './tags'
+
+/**
+ * Serialize attributes into the string that follows a tag name.
+ *
+ * Each emitted attribute is preceded by a single space. An attribute whose
+ * value is `null` is written as a bare key. A `style` attribute with an empty
+ * value is left out. Values are wrapped in single quotes, or in double quotes
+ * when the value itself contains a single quote; no escaping is performed, so
+ * a value containing both quote kinds is written as-is.
+ *
+ * @param attributes - Attributes to serialize, in output order.
+ * @returns The serialized attribute string, or `''` for no attributes.
+ */
+export const formatAttributes = (attributes: ElementAttribute[]) => {
+  return attributes.reduce((attrs, attribute) => {
+    const { key, value } = attribute
+    if (value === null) return `${attrs} ${key}`
+    // Skip an empty style attribute, but keep what has been built so far:
+    // returning '' here would discard every attribute that came before it.
+    if (key === 'style' && !value) return attrs
+
+    const quote = value.includes('\'') ? '"' : '\''
+    return `${attrs} ${key}=${quote}${value}${quote}`
+  }, '')
+}
+
+/**
+ * Serialize an AST back into an HTML string.
+ *
+ * Text nodes are written verbatim, comments are wrapped in comment markers,
+ * and elements are written with their attributes and recursively serialized
+ * children. Elements listed in `voidTags` get no children and no closing tag.
+ *
+ * @param tree - AST nodes to serialize.
+ * @returns The concatenated HTML of all nodes.
+ */
+export const toHTML = (tree: AST[]) => {
+  const renderNode = (node: AST): string => {
+    switch (node.type) {
+      case 'text':
+        return node.content
+      case 'comment':
+        return `<!--${node.content}-->`
+      default: {
+        const { tagName, attributes, children } = node as ElementAST
+        const openTag = `<${tagName}${formatAttributes(attributes)}>`
+        if (voidTags.includes(tagName.toLowerCase())) return openTag
+        return `${openTag}${toHTML(children)}</${tagName}>`
+      }
+    }
+  }
+  return tree.map(node => renderNode(node)).join('')
+}
--- a/src/utils/htmlParser/tags.ts
+++ b/src/utils/htmlParser/tags.ts
@ -0,0 +1,16 @@
+// Tags whose body the lexer does not tokenize as markup: everything up to the
+// matching closing tag is emitted as a single text token.
+export const childlessTags = ['style', 'script', 'template']
+
+// Tags for which the parser, on seeing a new opening tag, closes a still-open
+// element of the same name (subject to closingTagAncestorBreakers).
+export const closingTags = ['html', 'head', 'body', 'p', 'dt', 'dd', 'li', 'option', 'thead', 'th', 'tbody', 'tr', 'td', 'tfoot', 'colgroup']
+
+// For each tag, the ancestors that suppress that same-name closing when they
+// are open nearer the top of the parser stack than an open element of that tag.
+export const closingTagAncestorBreakers = {
+  li: ['ul', 'ol', 'menu'],
+  dt: ['dl'],
+  dd: ['dl'],
+  tbody: ['table'],
+  thead: ['table'],
+  tfoot: ['table'],
+  tr: ['table'],
+  td: ['table'],
+}
+
+// Tags that never get children in the parser and are serialized without a
+// closing tag.
+export const voidTags = ['!doctype', 'area', 'base', 'br', 'col', 'command', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr']
--- a/src/utils/htmlParser/types.ts
+++ b/src/utils/htmlParser/types.ts
@ -0,0 +1,69 @@
+/** A single attribute of an element in the formatted AST. */
+export interface ElementAttribute {
+  /** Attribute name: the text before the first `=`. */
+  key: string;
+  /** Unquoted attribute value, or `null` when the attribute has no `=` part. */
+  value: string | null;
+}
+
+/** Comment node as produced by the parser. */
+export interface CommentElement {
+  type: 'comment';
+  /** Comment text without the surrounding comment markers. */
+  content: string;
+}
+
+/** Text node as produced by the parser. */
+export interface TextElement {
+  type: 'text';
+  /** The raw text. */
+  content: string;
+}
+
+/** Element node as produced by the parser, before formatting. */
+export interface NormalElement {
+  type: 'element';
+  /** Tag name exactly as written in the source (case preserved). */
+  tagName: string;
+  /** Nodes nested inside this element. */
+  children: HTMLNode[];
+  /** Raw attribute strings taken from the attribute tokens, e.g. `key=value`. */
+  attributes: string[];
+}
+
+/** Any node the parser can produce. */
+export type HTMLNode = CommentElement | TextElement | NormalElement
+
+/** Element node in the formatted AST. */
+export interface ElementAST {
+  type: 'element';
+  /** Tag name; the formatter lower-cases it. */
+  tagName: string;
+  /** Formatted child nodes. */
+  children: AST[];
+  /** Attributes split into key/value pairs. */
+  attributes: ElementAttribute[];
+}
+
+/** Comment or text node in the formatted AST. */
+export interface CommentOrTextAST {
+  /** Distinguishes comments from plain text. */
+  type: 'comment' | 'text';
+  /** Comment body or text content. */
+  content: string;
+}
+
+/** Any node of the formatted AST. */
+export type AST = CommentOrTextAST | ElementAST
+
+/** Token marking the opening `<` of a tag. */
+export interface TagStartToken {
+  type: 'tag-start';
+  /** `true` when the tag opens with `</`, i.e. it is a closing tag. */
+  close: boolean;
+}
+
+/** Token marking the end of a tag. */
+export interface TagEndToken {
+  type: 'tag-end';
+  /** `true` when the lexer found a `/` where the tag ends (self-closing form). */
+  close: boolean;
+}
+
+/** Token carrying a tag name. */
+export interface TagToken {
+  type: 'tag';
+  /** Tag name exactly as written in the source. */
+  content: string;
+}
+
+/** Token carrying a run of text. */
+export interface TextToken {
+  type: 'text';
+  /** The raw text. */
+  content: string;
+}
+
+/** Token carrying a comment. */
+export interface CommentToken {
+  type: 'comment';
+  /** Comment text without the surrounding comment markers. */
+  content: string;
+}
+
+/** Token carrying one attribute of a tag. */
+export interface AttributeToken {
+  type: 'attribute';
+  /** Raw attribute text, either a bare key or `key=value` (quotes retained). */
+  content: string;
+}
+
+/** Any token the lexer can emit. */
+export type Token = TagStartToken | TagEndToken | TagToken | TextToken | CommentToken | AttributeToken