动态正则匹配

原创

北京小辉 2022-12-28 14:58:18 博主文章分类：【工具】正则 ©著作权

©著作权归作者所有：来自51CTO博客作者北京小辉的原创作品，请联系作者获取转载授权，否则将追究法律责任

需求：

1、写一个动态正则；

2、只要写出日志的Schema就可以获取到日志的正则。

package com.donews.util

import java.util.regex.Pattern

import scala.collection.mutable.ArrayBuffer

/**
  * Created by yuhui on 2016/8/5.
  */

/***
例子：       www.donews.com 123.125.71.72 - - [28/Nov/2016:11:08:50 +0800] "GET /media/201408/2834414.shtm HTTP/1.1" 200 11296 "-" "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)" China 22 Beijing
第一版本    "$domain $ip - $remote_user [$timestamp] \"$http_url\" $status $body_bytes_sent \"$http_referer\" \"$http_user_agent\" $country $region $city"

例子 ：      www.donews.com 123.125.71.72 - - [28/Nov/2016:11:08:50 +0800] "GET /media/201408/2834414.shtm HTTP/1.1" 200 11296 "-" "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)" "-" "China" "22" "Beijing"
第二版本    "$domain $ip - $remote_user [$timestamp] \"$http_url\" $status $body_bytes_sent \"$http_referer\" \"$http_user_agent\" \"$e_ip\" \"$country\" \"$region\" \"$city\""

例子 ：     www.donews.com 123.125.71.72 - - [28/Nov/2016:11:08:50 +0800] "GET /media/201408/2834414.shtm HTTP/1.1" "http://www.donews.com/media/201408/2834414.shtm" 200 11296 "-" "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)" "-" "China" "22" "Beijing"
第三版本    $domain $http_x_forwarded_for - $remote_user [$timestamp] "$http_url" "$url" $status $body_bytes_sent "$http_referer" "$http_user_agent" "$e_ip" "$country" "$region" "$city"
 */


/**
  * Builds a regular expression from a space-separated log schema such as
  * `$domain - [$timestamp] "$http_url" $status`, and splits log lines into
  * capture groups using that expression.
  *
  * One capture group is generated per schema token, so group `i` of a matching
  * line corresponds to token `i` of the schema.
  */
object DynamicRegex {

  /** Schema most recently set by the caller (written by `main`). */
  var cmd = ""

  /** Regular expression used by `lineToGroup`; callers assign it, typically from `tran`. */
  var regex = ""

  /** Capture group for one run of non-space characters. */
  private val AnyToken = "([\\S]+)"

  /** Token made up of non-word characters only, e.g. a literal `-`. */
  private val PunctuationOnly = Pattern.compile("^(\\W+)$")

  /** A variable surrounded by punctuation on both sides, e.g. `[$timestamp]`. */
  private val Wrapped = Pattern.compile("(\\W+)(\\$\\w+)(\\W+)")

  /** Leading variable name of a `name + separator` fragment. */
  private val LeadingWord = Pattern.compile("^\\w+")

  /** Characters that need a backslash to be literal inside a `[...]` class. */
  private val ClassMeta = "\\[]^-&"

  /** Last compiled `(source, pattern)` pair, so `lineToGroup` does not recompile per line. */
  @volatile private var compiled: (String, Pattern) = ("", Pattern.compile(""))

  /**
    * Translates a schema into an anchored regular expression.
    *
    * The schema is split on single spaces; empty tokens (from repeated spaces)
    * are skipped. Each remaining token becomes one capture group, the groups
    * are joined with `\s`, and the result is wrapped in `^` ... `$`.
    * An empty schema yields `^$`.
    *
    * @param cmd the schema, e.g. `$domain - [$timestamp] "$url"`
    * @return the regular expression source
    */
  def tran(cmd: String): String =
    cmd.split(" ").filter(_.nonEmpty).map(tokenPattern).mkString("^", "\\s", "$")

  /** Chooses the capture group for a single schema token. */
  private def tokenPattern(token: String): String =
    if (PunctuationOnly.matcher(token).find()) "(\\W+)"
    else token.indexOf('$') match {
      // No variable at all (a literal word): capture it like any other token
      // instead of failing on substring(0, -1).
      case -1 => AnyToken
      case 0  => variablePattern(token)
      case _  => wrappedPattern(token)
    }

  /**
    * Pattern for a token that starts with `$`.
    *
    * A single variable is a plain non-space capture. Several variables joined by
    * separators (`$host:$port`) become one group of the form `[\S]+[sep][\S]+`,
    * using the separator found after each variable name. If any gap has no
    * separator (`$a$b`) the token falls back to a plain non-space capture.
    */
  private def variablePattern(token: String): String = {
    val parts = token.split("\\$")
    if (parts.length <= 2) AnyToken
    else {
      // parts(0) is the empty string before the first '$'; every part except
      // the last one is "variable name + separator".
      val separators = parts.slice(1, parts.length - 1)
        .map(part => LeadingWord.matcher(part).replaceFirst(""))
      if (separators.exists(_.isEmpty)) AnyToken
      else separators
        .map(sep => "[\\S]+[" + classEscape(sep) + "]")
        .mkString("(", "", "[\\S]+)")
    }
  }

  /**
    * Pattern for a token whose variable is preceded by other characters,
    * e.g. `[$timestamp]` becomes `(\[.+\])`.
    *
    * Tokens that lack punctuation on one side (`[$date`) fall back to a plain
    * non-space capture, so the token still produces a group.
    */
  private def wrappedPattern(token: String): String = {
    val m = Wrapped.matcher(token)
    if (m.find()) "(" + escape(m.group(1)) + ".+" + escape(m.group(3)) + ")"
    else AnyToken
  }

  /** Backslash-escapes the characters of `separator` that are special inside `[...]`. */
  private def classEscape(separator: String): String =
    separator.map(ch => if (ClassMeta.indexOf(ch) >= 0) "\\" + ch else ch.toString).mkString

  private def isAsciiAlphanumeric(ch: Char): Boolean =
    (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') || (ch >= '0' && ch <= '9')

  /**
    * Prefixes every character with a backslash so that it is matched literally.
    *
    * Double quotes are left as they are. ASCII letters and digits are also left
    * as they are, because a backslash in front of them would form a regex escape
    * or back-reference (`\d`, `\1`) rather than a literal.
    *
    * @param original text to escape
    * @return the escaped text
    */
  def escape(original: String): String = {
    val out = new StringBuilder
    original.foreach { ch =>
      if (ch != '"' && !isAsciiAlphanumeric(ch)) out.append('\\')
      out.append(ch)
    }
    out.toString
  }

  /** Returns the compiled form of the current `regex`, recompiling only when its text changed. */
  private def compiledRegex: Pattern = {
    val source = regex
    val cached = compiled
    if (cached._1 == source) cached._2
    else {
      val fresh = Pattern.compile(source)
      compiled = (source, fresh)
      fresh
    }
  }

  /**
    * Applies the current `regex` to a log line and collects the capture groups.
    *
    * @param line one log line
    * @return groups 1..n of every match found, in order; empty when nothing matches
    */
  def lineToGroup(line: String): ArrayBuffer[String] = {
    val groups = ArrayBuffer[String]()
    val m = compiledRegex.matcher(line)
    while (m.find()) {
      for (i <- 1 to m.groupCount()) {
        groups.append(m.group(i))
      }
    }
    groups
  }

  /** Demo: builds the regex for a sample schema, prints it, then prints the groups of a sample line. */
  def main(args: Array[String]): Unit = {

    cmd = "$domain $http_x_forwarded_for - $remote_user [$timestamp] \"$http_url\" \"$url\" $status $body_bytes_sent \"$http_referer\" \"$http_user_agent\" \"$e_ip\" \"$country\" \"$region\" \"$city\""
    regex = tran(cmd)
    println(regex)
    val log = "www.donews.com 123.125.71.72 - - [28/Nov/2016:11:08:50 +0800] \"GET /media/201408/2834414.shtm HTTP/1.1\" \"http://www.donews.com/media/201408/2834414.shtm\" 200 11296 \"-\" \"Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)\" \"-\" \"China\" \"22\" \"Beijing\""
    lineToGroup(log).foreach(x => println(x))

  }
}

输出结果:

^([\S]+)\s([\S]+)\s(\W+)\s([\S]+)\s(\[.+\])\s(".+")\s(".+")\s([\S]+)\s([\S]+)\s(".+")\s(".+")\s(".+")\s(".+")\s(".+")\s(".+")$
www.donews.com
123.125.71.72
-
-
[28/Nov/2016:11:08:50 +0800]
"GET /media/201408/2834414.shtm HTTP/1.1"
"http://www.donews.com/media/201408/2834414.shtm"
200
11296
"-"
"Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)"
"-"
"China"
"22"
"Beijing"