需求:

1、写一个动态正则;

2、只要写出日志的Schema就可以获取到日志的正则。

package com.donews.util

import java.util.regex.Pattern

import scala.collection.mutable.ArrayBuffer

/**
* Created by yuhui on 2016/8/5.
*/

/***
例子 : www.donews.com 123.125.71.72 - - [28/Nov/2016:11:08:50 +0800] "GET /media/201408/2834414.shtm HTTP/1.1" 200 11296 "-" "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)" China 22 Beijing
第一版本 "$domain $ip - $remote_user [$timestamp] \"$http_url\" $status $body_bytes_sent \"$http_referer\" \"$http_user_agent\" $country $region $city"

例子 : www.donews.com 123.125.71.72 - - [28/Nov/2016:11:08:50 +0800] "GET /media/201408/2834414.shtm HTTP/1.1" 200 11296 "-" "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)" "-" "China" "22" "Beijing"
第二版本 "$domain $ip - $remote_user [$timestamp] \"$http_url\" $status $body_bytes_sent \"$http_referer\" \"$http_user_agent\" \"$e_ip\" \"$country\" \"$region\" \"$city\""

例子 : www.donews.com 123.125.71.72 - - [28/Nov/2016:11:08:50 +0800] "GET /media/201408/2834414.shtm HTTP/1.1" "http://www.donews.com/media/201408/2834414.shtm" 200 11296 "-" "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)" "-" "China" "22" "Beijing"
第三版本 $domain $http_x_forwarded_for - $remote_user [$timestamp] "$http_url" "$url" $status $body_bytes_sent "$http_referer" "$http_user_agent" "$e_ip" "$country" "$region" "$city"
*/


/**
 * Derives a line-matching regular expression from a log schema.
 *
 * A schema is a space-separated list of tokens. Each token becomes exactly one
 * capture group, and the groups are joined with `\s` between `^` and `$`:
 *
 *  - a token made only of non-word characters (such as `-`) becomes `(\W+)`;
 *  - a token without any dollar-prefixed placeholder is matched literally;
 *  - every placeholder (a dollar sign followed by word characters) becomes
 *    `[\S]+`, or `.+` when the token both starts and ends with literal text
 *    (for example a value wrapped in quotes or brackets), so that such a value
 *    may contain spaces;
 *  - literal text around and between placeholders is kept, escaped.
 */
object DynamicRegex {

  /** Schema text; assigned by `main`, not read by the other methods. */
  var cmd: String = ""

  /** Regular expression that `lineToGroup` applies to each line. */
  var regex: String = ""

  /** Recognises a token that consists solely of non-word characters. */
  private val SymbolsOnly = Pattern.compile("^(\\W+)$")

  /** Recognises one placeholder: a dollar sign followed by word characters. */
  private val Variable = Pattern.compile("\\$\\w+")

  /**
   * Last compiled `regex` together with its source text, so that `lineToGroup`
   * does not recompile the same expression for every line. Kept as a single
   * immutable pair so a reader always sees a matching (text, pattern) couple.
   */
  @volatile private var compiled: (String, Pattern) = ("", Pattern.compile(""))

  /**
   * Translates a schema into an anchored regular expression.
   *
   * @param cmd space-separated schema
   * @return `^`, one capture group per token separated by `\s`, then `$`
   */
  def tran(cmd: String): String =
    // mkString places the separator only between groups, so there is no
    // trailing "\s" to strip and an empty token list simply yields "^$".
    cmd.split(" ").map(tokenPattern).mkString("^", "\\s", "$")

  /** Builds the single capture group that corresponds to one schema token. */
  private def tokenPattern(key: String): String =
    if (SymbolsOnly.matcher(key).find()) {
      "(\\W+)"
    } else {
      // Limit -1 keeps leading and trailing empty strings, so the array always
      // holds (number of placeholders + 1) literal pieces, in order.
      val pieces = Variable.split(key, -1)
      val enclosed = pieces.length > 1 && pieces.head.nonEmpty && pieces.last.nonEmpty
      val placeholder = if (enclosed) ".+" else "[\\S]+"
      // With no placeholder there is a single piece and the token is literal.
      pieces.map(escape).mkString("(", placeholder, ")")
    }

  /**
   * Escapes literal schema text for use inside a regular expression.
   *
   * Every character is prefixed with a backslash except double quotes, letters,
   * digits and surrogate halves. Letters and digits are left bare because a
   * backslash in front of them would form a regex construct or backreference
   * instead of a literal.
   *
   * @param original literal text taken from a schema token
   * @return the text with its punctuation backslash-escaped
   */
  def escape(original: String): String = {
    val out = new StringBuilder
    original.foreach { ch =>
      val keepBare = ch == '"' || Character.isLetterOrDigit(ch) || Character.isSurrogate(ch)
      if (!keepBare) out.append('\\')
      out.append(ch)
    }
    out.toString
  }

  /**
   * Applies the current `regex` to a line and collects the captured groups.
   *
   * @param line one log line
   * @return the text of groups 1..n for every match found, in order; empty when
   *         the line does not match
   */
  def lineToGroup(line: String): ArrayBuffer[String] = {
    val source = regex
    val cached = compiled
    val pattern =
      if (cached._1 == source) {
        cached._2
      } else {
        val fresh = Pattern.compile(source)
        compiled = (source, fresh)
        fresh
      }
    val groups = ArrayBuffer[String]()
    val m = pattern.matcher(line)
    while (m.find()) {
      for (i <- 1 to m.groupCount()) {
        groups += m.group(i)
      }
    }
    groups
  }

  /** Demo: prints the regex for a sample schema and the groups of a sample line. */
  def main(args: Array[String]): Unit = {

    cmd = "$domain $http_x_forwarded_for - $remote_user [$timestamp] \"$http_url\" \"$url\" $status $body_bytes_sent \"$http_referer\" \"$http_user_agent\" \"$e_ip\" \"$country\" \"$region\" \"$city\""
    regex = tran(cmd)
    println(regex)
    val log = "www.donews.com 123.125.71.72 - - [28/Nov/2016:11:08:50 +0800] \"GET /media/201408/2834414.shtm HTTP/1.1\" \"http://www.donews.com/media/201408/2834414.shtm\" 200 11296 \"-\" \"Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)\" \"-\" \"China\" \"22\" \"Beijing\""
    lineToGroup(log).foreach(x => println(x))

  }
}

输出结果:

^([\S]+)\s([\S]+)\s(\W+)\s([\S]+)\s(\[.+\])\s(".+")\s(".+")\s([\S]+)\s([\S]+)\s(".+")\s(".+")\s(".+")\s(".+")\s(".+")\s(".+")$
www.donews.com
123.125.71.72
-
-
[28/Nov/2016:11:08:50 +0800]
"GET /media/201408/2834414.shtm HTTP/1.1"
"http://www.donews.com/media/201408/2834414.shtm"
200
11296
"-"
"Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)"
"-"
"China"
"22"
"Beijing"