java js转值给后台因为特殊字符被截断

转载

mob6454cc6e8f43 2024-09-15 14:47:57

文章标签 js转义html的特殊字符 html 字符串特殊字符 文章分类 Java 后端开发

HTML-Parser

背景：需求需要把 html 字符串转成 DOM 对象树或者 js 对象树，然后进行一些处理/操作。htmlparser 这个库还行，但是对 attribute 上含有特殊字符的属性值处理得不好。同时看了看开标签语法(syntax-start-tag:whatwg)、html-attribute 的支持规则(attributes:whatwg) 和一些其他库的实现，它们对一些边界场景(特殊属性值和 web component)的处理仍有缺失，所以干脆自己写了个 html parser 函数。

本文主要是记录下实现过程，做个技术沉淀，有相关需求的可以做个参考。

前期处理

首先，定义一些正则表达式，用以匹配希望找到的内容

// Raw special characters that get escaped inside attribute values.
const ltReg = /</g
const gtReg = />/g
const sqReg = /'/g
const qReg = /"/g
// Content of a single-quoted / double-quoted attribute value (text between =' … ' or =" … ").
const sqAttrReg = /(?<==')[^']*?(?=')/g
const qAttrReg = /(?<==")[^"]*?(?=")/g
// Entity forms of the same characters ("Bk" = back), for restoring escaped values.
// NOTE(review): reconstructed from entity-decoded text — confirm against the original article.
const qRegBk = /&quot;/g
const sqRegBk = /&#39;/g
const ltRegBk = /&lt;/g
const gtRegBk = /&gt;/g
// A whole `name="value"` or `name='value'` attribute, used for escaping its value.
const attrReplaceReg = /[:\w\d_-]*?=(["].*?["]|['].*?['])/g
// One attribute preceded by whitespace: `name="v"`, `name='v'`, `name=v`, or a bare `name`.
const attrReg = /(?<=\s)([:\w\d-]+=(["'].*?["']|[\w\d]+)|\w+)/g
// A string made of digits only.
const numReg = /^\d+$/
// Line feeds.
const clReg = /\n/g
// A single whitespace character / a run of whitespace characters.
const sReg = /\s/g
const spReg = /\s+/g
// Any tag-like token: `<` … `>` with no nested angle brackets.
const tagReg = /<[^<>]*?>/
// A start tag (`<` not followed by `/` or `!`).
const startReg = /<[^\/!].*?>/
// An end tag (`</…>`).
const endReg = /<\/.*?>/
// The text between `<!--` and `-->`.
const commentReg = /(?<=<!--).*?(?=-->)/
// The tag name immediately after `<`.
const tagCheckReg = /(?<=<)[\w-]+/

开始处理逻辑，拿个简单的 html 字符串做例子。

// Sample input: nested markup whose `data-html` attribute value itself contains raw tags.
const str = [
  '',
  '<div id="container">',
  '  <div class="test" data-html="<p>hello 1</p>">',
  '    <p>hello 2</p>',
  '    <input type="text" value="hello 3" >',
  '  </div>',
  '</div>',
  ''
].join('\n')

属性值转义

拿到字符串 str，取各个开标签，并将标签内的 attribute 里的特殊字符做转义字符替换，返回字符串 str1

/**
 * Escapes special characters inside quoted attribute values so that later
 * tag splitting is not confused by `<`, `>` or quotes that belong to a value.
 *
 * For every attribute matched by `attrReplaceReg`:
 *  - `<` and `>` become `&lt;` / `&gt;`;
 *  - `"` inside a single-quoted value becomes `&quot;`;
 *  - `'` inside a double-quoted value becomes `&#39;`.
 *
 * @param html Markup to pre-process.
 * @returns The markup with attribute values escaped; text outside attributes is untouched.
 */
const replaceAttribute = (html: string): string => {
  return html.replace(attrReplaceReg, attr =>
    attr
      .replace(ltReg, '&lt;')
      .replace(gtReg, '&gt;')
      .replace(sqAttrReg, value => value.replace(qReg, '&quot;'))
      .replace(qAttrReg, value => value.replace(sqReg, '&#39;'))
  )
}

结果如下：

;`<div id="container">
  <div class="test" data-html="&lt;p&gt;hello 1&lt;/p&gt;">
    <p>hello 2</p>
    <input type="text" value="hello 3" >
  </div>
</div>`

形成内容数组

从上一步的字符串 str1 中截取出元素(元素是: 开标签、内容、闭合标签)，放入新数组 arr。

/**
 * Splits markup into a flat list of tokens: tags (anything matched by
 * `tagReg`) and the text found between them, in document order.
 *
 * Text that consists only of characters matched by `sReg` is skipped.
 * Text after the last tag is kept as well, and input without any tag is
 * returned as a single text token; it is no longer silently dropped.
 *
 * @param html Markup, expected to have been pre-processed so attribute values contain no raw `<` / `>`.
 * @returns Tokens in the order they appear in `html`.
 */
const convertStringToArray = (html: string): string[] => {
  const arr: string[] = []
  // Pushes a text run unless it is empty once `sReg` matches are removed.
  const pushText = (text: string): void => {
    if (text.replace(sReg, '').length > 0) {
      arr.push(text)
    }
  }
  let rest = html
  let found = rest.match(tagReg)
  while (found && found.index !== undefined) {
    if (found.index > 0) {
      pushText(rest.slice(0, found.index))
    }
    arr.push(found[0])
    // Continue scanning right after the tag that was just consumed.
    rest = rest.slice(found.index + found[0].length)
    found = rest.match(tagReg)
  }
  // Whatever remains after the final tag is trailing text.
  pushText(rest)
  return arr
}

结果如下：

['<div id="container">', '<div class="test" data-html="&lt;p&gt;hello 1&lt;/p&gt;">', '<p>', 'hello 2', '</p>', '<input type="text" value="hello 3" >', '</div>', '</div>']

生成对象树

循环上一步形成的 arr，处理成对象树

// Tags that never take a closing tag (HTML void elements, plus the legacy `basefont`).
// `col` and `wbr` are void elements too and were missing from the list.
const singleTags: string[] = [
  'img',
  'input',
  'br',
  'hr',
  'meta',
  'link',
  'param',
  'base',
  'basefont',
  'area',
  'source',
  'track',
  'embed',
  'col',
  'wbr'
]

// Removes every match of `sReg` (invisible/whitespace characters) from the string.
const toOneLine = (str: string): string => str.replace(sReg, '')

/**
 * Builds a node tree from a flat token list (tags and text, in document order).
 *
 * Start tags create an element, receive their attributes and become the
 * current parent unless they are single (void or self-closing `/>`) tags.
 * End tags pop the current parent. Comment tokens and everything else are
 * appended to the current parent as comment / text nodes.
 *
 * @param arr Tokens: tags such as `<div id="a">`, `</div>`, `<!-- c -->`, and text runs.
 * @returns The `children` of the synthetic root node.
 * @throws Error when a start tag has no recognisable tag name, or when the
 *   number of start tags and end tags does not balance.
 */
const makeUpTree = (arr: string[]) => {
  // DomUtil returns a DOM node in a browser and a plain custom object otherwise.
  const root = DomUtil('container')
  // Count of start tags that are still waiting for their end tag.
  let deep = 0
  // Stack of open elements; the last entry receives newly created nodes.
  const parentElements = [root]
  for (const token of arr) {
    const parentElement = parentElements[parentElements.length - 1]
    if (!parentElement) {
      // A surplus end tag already popped the root; remaining tokens are
      // ignored and the balance check below reports the error.
      continue
    }
    // Whitespace-free copy, used only to classify the token.
    const inlineToken = toOneLine(token)
    if (startReg.test(inlineToken)) {
      const tagName = token.match(tagCheckReg)
      if (!tagName) {
        throw new Error('标签规范错误')
      }
      deep++
      const element = DomUtil(tagName[0])
      // NOTE(review): matchAttr is not defined in this section; it is used
      // here as returning [name, value] pairs — confirm.
      matchAttr(token).forEach(function(attr) {
        element.setAttribute(attr[0], attr[1] ? attr[1] : '')
      })
      parentElement.appendChild(element)
      // Single tags close immediately. Tag names are compared in lower case
      // because HTML tag names are case-insensitive (`<BR>` is still void).
      const isSingle =
        singleTags.indexOf(tagName[0].toLowerCase()) > -1 ||
        token.charAt(token.length - 2) === '/'
      if (isSingle) {
        deep--
      } else {
        parentElements.push(element)
      }
    } else if (endReg.test(inlineToken)) {
      deep--
      parentElements.pop()
    } else if (commentReg.test(inlineToken)) {
      const matchValue = token.match(commentReg)
      const comment = matchValue ? matchValue[0] : ''
      parentElement.appendChild(DomUtil('comment', comment))
    } else {
      parentElement.appendChild(DomUtil('text', token))
    }
  }
  if (deep < 0) {
    throw new Error('存在多余闭合标签')
  } else if (deep > 0) {
    throw new Error('存在多余开标签')
  }
  // NOTE(review): on a browser DOM node `children` lists element nodes only,
  // so top-level text/comment nodes would be missing — confirm whether
  // `childNodes` is intended there.
  return root.children
}

`DomUtil` 相关：

// Plain-object stand-in for a DOM node, used when no browser DOM is available (Node.js side).
class CustomDom {
  /** Tag name, or the node kind passed by the caller (e.g. 'text', 'comment'). */
  tagName: string
  /** Payload given at construction (e.g. text or comment content); undefined when omitted. */
  data: string | undefined
  /** Attribute name → value. */
  attrs: Record<string, string>
  /** Child nodes in insertion order. */
  children: CustomDom[]

  /**
   * @param tag Tag name or node kind.
   * @param data Optional payload stored as-is on `data`.
   */
  constructor(tag: string, data?: string) {
    this.tagName = tag
    this.data = data
    this.attrs = {}
    this.children = []
  }

  /** Sets (or overwrites) one attribute. */
  setAttribute(key: string, value: string): void {
    this.attrs[key] = value
  }

  /** Appends `d` as the last child. */
  appendChild(d: CustomDom): void {
    this.children.push(d)
  }
}
/**
 * Creates a node for the current environment: a real DOM node when
 * `window.document` exists, otherwise a `CustomDom` object.
 *
 * @param tag 'comment', 'container', 'text', or an element tag name.
 * @param v Content for comment/text nodes; optional because element and
 *   container nodes are created without it.
 */
const DomUtil = (tag: string, v?: string) => {
  const hasDom = typeof window !== 'undefined' && window.document
  if (!hasDom) {
    return new CustomDom(tag, v)
  }
  switch (tag) {
    case 'comment':
      // Fall back to '' so a missing value is not rendered as the text "undefined".
      return document.createComment(v ?? '')
    case 'container':
      // The synthetic root is an ordinary <div>.
      return document.createElement('div')
    case 'text':
      return document.createTextNode(v ?? '')
    default:
      return document.createElement(tag)
  }
}

结果如下：

;[
  {
    attrs: {
      id: 'container'
    },
    children: [
      {
        attrs: {
          class: 'test',
          'data-html': '<p>hello 1</p>'
        },
        children: [
          {
            attrs: {},
            children: [
              {
                attrs: {},
                children: [],
                tagName: 'text',
                data: 'hello 2'
              }
            ],
            tagName: 'p'
          },
          {
            attrs: {
              type: 'text',
              value: 'hello 3'
            },
            children: [],
            tagName: 'input'
          }
        ],
        tagName: 'div'
      }
    ],
    tagName: 'div'
  }
]

组合

组合以上的 3 个步骤

/**
 * Parses an HTML string into a node tree by chaining the three steps:
 * escape special characters in attribute values, split the string into
 * tag/text tokens, then assemble the tokens into a tree.
 */
const Parser = (html: string) =>
  makeUpTree(convertStringToArray(replaceAttribute(html)))