思路
不使用正则表达式处理:先按空白切割字符串;再将 [] 和 "" 括起的内容特殊处理(合并为一个字段);然后将每段数据转换为对应格式;最后精简代码,并检查代码效率。
import datetime
# Sample access-log line to parse. The adjacent literals below are joined
# into one single-line string.
logline = (
    '183.60.212.153 - - [19/Feb/2013:10:23:29 +0800] '
    '"GET /o2o/media.html?menu=3 HTTP/1.1" 200 16691 "-" '
    '"Mozilla/5.0 (compatible; EasouSpider; +http://www.easou.com/search/spider.html)"'
)
# A plain whitespace split cuts the bracketed and quoted fields into pieces:
# ['183.60.212.153', '-', '-', '[19/Feb/2013:10:23:29', '+0800]',
#  '"GET', '/o2o/media.html?menu=3', 'HTTP/1.1"', '200', '16691',
#  '"-"', '"Mozilla/5.0', '(compatible;', 'EasouSpider;',
#  '+http://www.easou.com/search/spider.html)"']
clean_log = str.split(logline)
# Convert the timestamp field into a datetime object.
def convert_time(time: str) -> datetime.datetime:
    """Parse an access-log timestamp into a timezone-aware datetime.

    Args:
        time: Timestamp text without the surrounding brackets, for example
            ``19/Feb/2013:10:23:29 +0800``.

    Returns:
        The parsed ``datetime.datetime``; the ``%z`` directive attaches the
        UTC offset found in the text.

    Raises:
        ValueError: If ``time`` does not match ``%d/%b/%Y:%H:%M:%S %z``.
    """
    return datetime.datetime.strptime(time, '%d/%b/%Y:%H:%M:%S %z')
# Split the request string into its three parts.
def convert_request(request: str) -> dict:
    """Split a request line into method, URL and protocol.

    Args:
        request: Request text without the surrounding quotes, for example
            ``GET /o2o/media.html?menu=3 HTTP/1.1``.

    Returns:
        A dict with the keys ``method``, ``url`` and ``protocol`` mapped to
        the whitespace-separated parts in order. ``zip`` stops at the shorter
        input, so a request with fewer than three parts yields fewer keys and
        any parts beyond the third are ignored.
    """
    return dict(zip(('method', 'url', 'protocol'), request.split()))
# Output key for each parsed field, by position. The empty string is used
# for the fields that are not given a name.
names = [
    'remote',
    '',
    '',
    'time',
    'request',
    'status',
    'size',
    '',
    'useragent',
]
# Converter applied to each parsed field, by position (parallel to `names`).
# None means the field text is stored as-is.
operations = [
    None,             # remote
    None,             # (unnamed)
    None,             # (unnamed)
    convert_time,     # time
    convert_request,  # request
    int,              # status
    int,              # size
    None,             # (unnamed)
    None,             # useragent
]
# Split a log line into fields, keeping bracketed/quoted groups together.
def log_clean(line: str, ret=None):
    """Split a log line into fields without using regular expressions.

    The line is split on whitespace. A word that starts with ``[`` or ``"``
    opens a group; following words are joined with single spaces until a word
    ends with the matching closer (``]`` or ``"``). The delimiters themselves
    are removed. Escaped quotes inside a quoted field are not handled.

    Args:
        line: The raw log line.
        ret: Optional list to append the fields to. A new list is created
            when omitted.

    Returns:
        The list holding the fields (``ret`` itself when one was passed in).
    """
    # Compare against None: an empty list passed by the caller must be used,
    # and the default must become a fresh list on every call.
    if ret is None:
        ret = []
    closers = {'[': ']', '"': '"'}
    pending = []  # words collected for the group that is currently open
    closer = ''   # closing delimiter being waited for; '' when no group is open
    for word in line.split():
        if closer:
            # Inside a group every word belongs to it, even one that itself
            # looks like an opener.
            pending.append(word)
            if word.endswith(closer):
                ret.append(' '.join(pending)[:-1])
                pending = []
                closer = ''
            continue
        expected = closers.get(word[0])
        if expected is None:
            ret.append(word)
        elif len(word) > 1 and word.endswith(expected):
            # Group opened and closed within one word, e.g. "-" or [abc].
            ret.append(word[1:-1])
        else:
            pending.append(word[1:])
            closer = expected
    if closer:
        # Unterminated group: keep what was collected instead of dropping it.
        ret.append(' '.join(pending))
    return ret
# Walk the cleaned fields, convert each one with the converter at the same
# position, and store the result under the field name at that position.
# An explicit empty list is passed so the fields are collected in `ret`:
# log_clean appends to the list it is given.
ret = []
log_clean(logline, ret)
ret_d = {}
# zip stops at the shortest input, so fields beyond the configured names are
# ignored rather than raising IndexError. Fields named '' share one key, so
# the last of them is the value that remains.
for key, convert, field in zip(names, operations, ret):
    ret_d[key] = convert(field) if convert else field
print(ret_d)