之前在做CDN运维的时候,因为业务的特殊性(跨机房,跨ISP,跨区域),把日志集中传输到一个中心来做qos的分析不太现实,因此采用的方法是每5分钟对Nginx日志进行切割,然后通过Python程序计算http code的分布,并通过Zabbix来实现单台机器Nginx qos的监控,配合对Zabbix数据库的Lastvalue进行聚合,则可以监控整个CDN的流量,qos数据等,这样一般发现问题的延迟就在5分钟左右(cdn的qos敏感性不是很强),配合rsync+hadoop+hive来计算nginx的日志,也可以得到更加详细的各个维度的分析(离线数据分析),下面贴下Nginx日志使用的分析脚本:

Nginx Httpcode分析脚本

先贴下zabbix聚合脚本:

  1. #!/usr/bin/python

  2. #to get webcdn totaol statistics

  3. # -*- coding: utf8 -*-

  4. import MySQLdb

  5. import sys

  6. import os

  7. def get_total_value(sql):

  8. db = MySQLdb.connect(host='xxxx',user='xxxx',passwd='xxxx',db='xxxx')

  9. cursor = db.cursor()

  10.    cursor.execute(sql)

  11.    try:  

  12. result = cursor.fetchone()[0]

  13.    except:

  14. result = 0

  15.    cursor.close()

  16.    db.close()

  17.    return result

  18. if __name__ == '__main__':

  19. sql = ''

  20.    if sys.argv[1] == "network_traffic":

  21. sql = "select round(sum(lastvalue)/(1024*1024),4) from  hosts a, items b   where key_ in ( 'net.if.out[eth1,bytes]','net.if.out[eth0,bytes]') and lower(host) like '%-cdn-cache%'  and a.hostid = b.hostid"

  22.    elif sys.argv[1] == "nginx_traffic":

  23. sql = "select sum(lastvalue) from  hosts a, items b   where key_ = 'log_webcdn_getvalue[traffic]'   and lower(host) like '%cdn-cache%'  and a.hostid = b.hostid"

  24.    elif sys.argv[1] == "2xxand3xx":

  25. sql = "select sum(lastvalue) from  hosts a, items b   where key_ in ( 'log_webcdn_getvalue[200]','log_webcdn_getvalue[300]') and lower(host) like '%-cdn-cache%'  and a.hostid = b.hostid"

  26.    elif sys.argv[1] == "4xxand5xx":

  27. sql = "select sum(lastvalue) from  hosts a, items b   where key_ in ( 'log_webcdn_getvalue[four]','log_webcdn_getvalue[five]') and lower(host) like '%-cdn-cache%'  and a.hostid = b.hostid"

  28.    elif sys.argv[1] == "network_ss":

  29. sql = "select sum(lastvalue) from  hosts a, items b   where key_ = 'network_conn' and lower(host) like '%-cdn-cache%'  and a.hostid = b.hostid"

  30.    else:

  31.        sys.exit(0)

  32. #    print sql

  33. value = get_total_value(sql)

  34.    print value

然后是单台的分析脚本:

  1. #!/usr/bin/python

  2. #coding=utf-8

  3. from __future__ import division

  4. import subprocess, signal,string

  5. import codecs

  6. import re

  7. import os

  8. import time, datetime

  9. import sys

  10. def show_usage():

  11.    print """

  12.        python nginx_log_wedcdn.py result_key

  13.        result_key could be:

  14.        average_bodysize, response_time, sum_count, count_success, four, 403, 404, 499, five, 500, 502, 503, 200, 300, requests_second

  15.        response_time_source, percentage_time_1, percentage_time_3, all

  16.          """

  17. def runCmd(command, timeout = 10):

  18. start = datetime.datetime.now()

  19. process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)

  20.    while process.poll() is None:

  21.        time.sleep(0.2)

  22. now = datetime.datetime.now()

  23.        if (now - start).seconds > timeout:

  24.            os.kill(process.pid, signal.SIGKILL)

  25.            os.waitpid(-1, os.WNOHANG)

  26.            return None

  27.    return process.stdout.readlines()

  28. def get_old_filename():

  29. t = datetime.datetime.now() + datetime.timedelta(minutes = -5)

  30. a = t.strftime('%Y-%m-%d-%H')

  31. b = t.strftime('%M')

  32. b = int(b)//5*5

  33.    if b <10:

  34. c = "0" + str(b)

  35.    else:

  36. c = str(b)

  37. d = "/log/nginx/old/" + a + "-%s.log.gz" % c

  38.    #print d

  39.    return d

  40. def get_new_filename():

  41. t = datetime.datetime.now() + datetime.timedelta(minutes = -5)

  42. a = t.strftime('%Y-%m-%d-%H')

  43. b = t.strftime('%M')

  44. b = int(b)//5*5

  45.    if b <10:

  46. c = "0" + str(b)

  47.    else:

  48. c = str(b)

  49. d = "/log/nginx/old/" + a + "-%s.log" % c

  50.    #print d

  51.    return d

  52. def get_new2_filename():

  53. t = datetime.datetime.now() + datetime.timedelta(minutes = -5)

  54. a = t.strftime('%Y-%m-%d-%H')

  55. b = t.strftime('%M')

  56. b = int(b)//5*5

  57.    if b <10:

  58. c = "0" + str(b)

  59.    else:

  60. c = str(b)

  61. d = "/log/nginx/new/" + a + "-%s.log" % c

  62.    #print d

  63.    return d

  64. def average_flow():

  65. flow = 0

  66. flow1 = 0

  67. flow_ppsucai = 0

  68. flow_asimgs = 0

  69. flow_static9 = 0

  70. traffic = 0.0

  71. traffic1 = 0.0

  72. count = 0

  73. count_sucai = 0

  74. count_sucai_100 = 0

  75. count_sucai_30_100 = 0

  76. count_sucai_30 = 0

  77. count_asimgs = 0

  78. count_asimgs_100 = 0

  79. count_asimgs_30_100 = 0

  80. count_asimgs_30 = 0

  81. count_static9 = 0

  82. count_static9_100 = 0

  83. count_static9_30_100 = 0

  84. count_static9_30 = 0

  85. sum_time = 0.0

  86. sum_ppsucai_time = 0.0

  87. sum_asimgs_time = 0.0

  88. sum_static9_time = 0.0

  89. sum_time_source = 0.0

  90. count_200 = 0

  91. count_300 = 0

  92. count_success = 0

  93. count_200_backup = 0

  94. count_not_200_backup = 0

  95. id_list_200 = [200,206]

  96. id_list_300 = [300,301,302,303,304,305,306,307]

  97. id_list_success = [200,206,300,301,302,303,304,305,306,307]

  98. data_byte = 0

  99. elapsed = 0.0

  100. response_time = 0.0

  101. response_time_source = 0.0

  102. requests_second = 0.0

  103. requests_second_sucai = 0.0

  104. requests_second_asimgs = 0.0

  105. list_time_1 = []

  106. list_time_3 = []

  107. list_ip_403 = []

  108. list_ip_404 = []

  109. list_ip_415 = []

  110. list_ip_499 = []

  111. list_ip_500 = []

  112. list_ip_502 = []

  113. list_ip_503 = []

  114. server_list = ['"127.0.0.1:8080"','"127.0.0.1:8081"','"-"']

  115. file_name = get_old_filename()

  116.    if os.path.isfile("%s" % file_name):

  117.        Writelog(file_name)

  118. i = os.popen("/bin/zcat %s" % file_name).readlines()

  119.        #i = gzip.GzipFile("%s" % file_name).readlines()

  120.    else:

  121. file_name = get_new_filename()

  122.        if os.path.isfile("%s" % file_name):

  123.            Writelog(file_name)

  124. i = os.popen("/bin/cat %s" % file_name).readlines()

  125.        else:

  126.            #time.sleep(15)

  127. file_name = get_new2_filename()

  128.            if os.path.isfile("%s" % file_name):

  129.                Writelog(file_name)

  130. i = os.popen("/bin/cat %s" % file_name).readlines()

  131.            else:

  132.                os.popen("rm -f /tmp/exist.txt")

  133.                sys.exit(1)

  134.    for line in i:

  135.            count += 1

  136.            try:

  137. domain_name = line.split()[1]

  138.            except:

  139.                pass  

  140.            try:

  141. web_code = int(line.split()[8])

  142.            except:

  143. web_code = 888

  144.            try:

  145. IP = str(line.split()[0])

  146.            except:

  147.                pass

  148.            try:  

  149. data_byte = int(line.split()[9])

  150.                #print "data", data_byte

  151.            except:

  152. data_byte = 0.0001

  153.            try:

  154. elapsed = float(line.split()[-1].strip('"'))

  155.                if elapsed == 0.000:

  156. elapsed = 0.0001

  157.            except:

  158. elapsed = 0.0001

  159.            try:  

  160. time_source = float(line.split()[-4].strip('"'))

  161.            except:

  162. time_source = 0.0

  163.            try:

  164. backup_server =  str(line.split()[-3])

  165.            except:

  166.                pass

  167.            flow1 += data_byte

  168.            if web_code in id_list_success:

  169.                flow += data_byte

  170.                sum_time_source += time_source

  171.                if domain_name != "ppsucai.pptv.com":

  172.                    sum_time += elapsed

  173.                else:

  174.                    #print domain_name

  175.                    sum_time += 0.000

  176.            if web_code in id_list_200:

  177.                #print web_code

  178.                count_200 += 1

  179.                if backup_server not in server_list:

  180.                    #print web_code, backup_server

  181.                    count_200_backup += 1

  182.            elif web_code == 200 and date_byte == 0:

  183.                #print line.split()[3].lstrip("[")

  184.                WriteURLInfo(line.split()[3].lstrip("["))

  185.                WriteURLInfo("\t")

  186.                WriteURLInfo(line.split()[10])

  187.                WriteURLInfo("\n")

  188.            elif web_code in id_list_300:

  189.                count_300 += 1

  190.            elif web_code == 403 and IP not in list_ip_403:

  191.                list_ip_403.append(IP)

  192.                #print "this is the sum 403 count:", IP, len(list_ip_403)

  193.            elif web_code == 404 and IP not in list_ip_404:

  194.                list_ip_404.append(IP)

  195.                #print "this is the sum 404 count:", IP, len(list_ip_404)

  196.            elif web_code == 415 and IP not in list_ip_415:

  197.                list_ip_415.append(IP)

  198.                #print "this is the sum 415 count:", IP, len(list_ip_415)

  199.            elif web_code == 499 and IP not in list_ip_499:

  200.                list_ip_499.append(IP)

  201.                #print "this is the sum 499 count:", IP, len(list_ip_499)

  202.            elif web_code == 500 and IP not in list_ip_500:

  203.                list_ip_500.append(IP)

  204.                #print "this is the sum 500 count:", IP, len(list_ip_500)

  205.            elif web_code == 502 and IP not in list_ip_502:

  206.                list_ip_502.append(IP)

  207.                #print "this is the sum 502 count:", IP, len(list_ip_502)

  208.            elif web_code == 503 and IP not in list_ip_503:

  209.                list_ip_503.append(IP)

  210.                #print "this is the sum 503 count:", IP, len(list_ip_503)

  211.            if web_code not in id_list_200 and backup_server not in server_list:

  212.                #print web_code, backup_server

  213.                count_not_200_backup += 1


  214.            if elapsed > 1.0 and web_code in id_list_success and IP not in list_time_1:

  215.                list_time_1.append(IP)

  216.            elif elapsed > 3.0 and web_code in id_list_success and IP not in list_time_3:

  217.                list_time_3.append(IP)


  218.            if domain_name == "ppsucai.pptv.com" and web_code in id_list_success:

  219. download_speed_sucai = round(data_byte / elapsed / 1024, 2)

  220.                flow_ppsucai += data_byte

  221.                sum_ppsucai_time += elapsed

  222.                count_sucai += 1

  223.                if download_speed_sucai >= 100:

  224.                    count_sucai_100 += 1

  225.                elif download_speed_sucai <100 and download_speed_sucai >= 30:

  226.                    count_sucai_30_100 += 1

  227.                else:

  228.                    count_sucai_30 += 1

  229.            elif domain_name == "asimgs.pplive.cn" and web_code in id_list_success:

  230. download_speed_asimgs = round(data_byte / elapsed / 1024, 2)

  231.                flow_asimgs += data_byte

  232.                sum_asimgs_time += elapsed

  233.                count_asimgs += 1

  234.                if download_speed_asimgs >= 100:

  235.                    count_asimgs_100 += 1

  236.                elif download_speed_asimgs <100 and download_speed_asimgs >= 30:

  237.                    count_asimgs_30_100 += 1

  238.                else:

  239.                    count_asimgs_30 += 1

  240.            elif domain_name == "static9.pplive.cn" and web_code in id_list_success:

  241. download_speed_static9 = round(data_byte / elapsed / 1024, 2)

  242.                flow_static9 += data_byte

  243.                sum_static9_time += elapsed

  244.                count_static9 += 1

  245.                if download_speed_static9 >= 100:

  246.                    count_static9_100 += 1

  247.                elif download_speed_static9 <100 and download_speed_static9 >= 30:

  248.                    count_static9_30_100 += 1

  249.                else:

  250.                    count_static9_30 += 1

  251.        #else:

  252.            #break

  253.    try:

  254. traffic = round((flow*1.07*8)/300/1024/1024, 2)

  255.        #traffic1 = round((flow1*1.07)/300/1024/1024, 2)

  256.        #print traffic, traffic1

  257.        #traffic1 = round(flow/sum_time/1024/1024, 2)

  258. count_success = count_200 + count_300

  259. response_time = round(sum_time/count_success, 2)

  260. response_time_source = round(sum_time_source/count_success, 2)

  261. requests_second = round(count_success/300, 2)

  262.        if sum_ppsucai_time == 0.0:

  263. sum_ppsucai_time = 0.0001

  264.        if sum_asimgs_time == 0.0:

  265. sum_asimgs_time = 0.0001

  266.        #print sum_static9_time

  267.        if sum_static9_time == 0.0:

  268. sum_static9_time = 0.0001

  269. traffic_ppsucai = round(flow_ppsucai/sum_ppsucai_time/1024, 2)

  270. traffic_asimgs = round(flow_asimgs/sum_asimgs_time/1024, 2)

  271. traffic_static9 = round(flow_static9/sum_static9_time/1024, 2)

  272.        #print "flow_static:", flow_static9, "traffic_static9", traffic_static9

  273. average_bodysize = round((flow/count_success)/1024, 2)

  274. percentage_time_1 = round(len(list_time_1)/count_success*100, 2)

  275. percentage_time_3 = round(len(list_time_3)/count_success*100, 2)

  276.        if count_sucai == 0:

  277. count_sucai = 0.0001

  278. percentage_sucai_100 = round(count_sucai_100/count_sucai*100, 2)

  279. percentage_sucai_30_100 = round(count_sucai_30_100/count_sucai*100, 2)

  280. percentage_sucai_30 = round(count_sucai_30/count_sucai*100, 2)

  281.        if count_asimgs == 0:

  282. count_asimgs = 0.0001

  283. percentage_asimgs_100 = round(count_asimgs_100/count_asimgs*100, 2)

  284. percentage_asimgs_30_100 = round(count_asimgs_30_100/count_asimgs*100, 2)

  285. percentage_asimgs_30 = round(count_asimgs_30/count_asimgs*100, 2)

  286.        #print count_static9

  287.        if count_static9 == 0:

  288. count_static9 = 0.0001

  289. percentage_static9_100 = round(count_static9_100/count_static9*100, 2)

  290.        #print count_static9_100, "100", percentage_static9_100

  291. percentage_static9_30_100 = round(count_static9_30_100/count_static9*100, 2)

  292.        #print count_static9_30_100, "30-100", percentage_static9_30_100

  293. percentage_static9_30 = round(count_static9_30/count_static9*100, 2)

  294.        #print count_static9_30, "30", percentage_static9_30

  295. requests_second_sucai = round(count_sucai/300, 2)

  296. requests_second_asimgs = round(count_asimgs/300, 2)

  297. requests_second_static9 = round(count_static9/300, 2)

  298.        #print requests_second_static9

  299.        #print count, "this is the count of 2xx_backup:", count_200_backup,"%", round(count_200_backup/count, 4),"this is the count of !2xx_backup:", count_not_200_backup, round(count_not_200_backup/count, 4)

  300. percentage_200_backup = round(count_200_backup/count*100, 2)

  301. percentage_not_200_backup = round(count_not_200_backup/count*100, 2)

  302.        return average_bodysize, response_time, count, count_success, len(list_ip_403), len(list_ip_404), len(list_ip_499), len(list_ip_500), len(list_ip_502), len(list_ip_503), count_200, count_300, requests_second, response_time_source, len(list_time_1), len(list_time_3), percentage_time_1, percentage_time_3,count_sucai,percentage_sucai_100, percentage_sucai_30_100, percentage_sucai_30, requests_second_sucai, count_asimgs, percentage_asimgs_100, percentage_asimgs_30_100, percentage_asimgs_30, requests_second_asimgs, traffic_ppsucai, traffic_asimgs, traffic, traffic_static9, count_static9, percentage_static9_100, percentage_static9_30_100, percentage_static9_30, requests_second_static9, percentage_200_backup, percentage_not_200_backup, len(list_ip_415)

  303.    except:

  304.        return 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0

  305. def log_files(pwd):

  306. log_file_list = []

  307. files = os.popen("ls %s" % pwd).readlines()

  308.        for x in files:

  309.                if x.strip().endswith("log"):

  310.                        log_file_list.append(x.strip())

  311.        return log_file_list

  312. def result_dic():

  313. list = average_flow()

  314.    #print list

  315. #   print list

  316. result = {}

  317.    result['average_bodysize'] = list[0]

  318.    result['response_time'] = list[1]

  319.    result['sum_count'] = list[2]

  320.    result['count_success'] = list[3]

  321.    result['four'] = list[4] + list[5] + list[6] + list[39]

  322. #   print 'four','=','%s' % list[4],'+','%s' % list[5],'+','%s' % list[6],'+','%s' % list[39], result['four']

  323.    result['403'] = list[4]

  324. #   print '403', result['403']

  325.    result['404'] = list[5]

  326. #   print '404', result['404']

  327.    result['499'] = list[6]

  328. #   print '499', result['499']

  329.    result['415'] = list[39]

  330. #   print '415', result['415']

  331.    result['five'] = list[7] + list[8] + list[9]

  332.    result['500'] = list[7]

  333.    result['502'] = list[8]

  334.    result['503'] = list[9]

  335.    result['200'] = list[10]

  336.    result['300'] = list[11]

  337.    result['requests_second'] = list[12]

  338.    result['response_time_source'] = list[13]

  339.    result['percentage_time_1'] = list[16]

  340.    result['percentage_time_3'] = list[17]

  341.    result['count_sucai'] = list[18]

  342.    result['percentage_sucai_100'] = list[19]

  343.    result['percentage_sucai_30_100'] = list[20]

  344.    result['percentage_sucai_30'] = list[21]

  345.    result['requests_second_sucai'] = list[22]

  346.    result['count_asimgs'] = list[23]

  347.    result['percentage_asimgs_100'] = list[24]

  348.    result['percentage_asimgs_30_100'] = list[25]

  349.    result['percentage_asimgs_30'] = list[26]

  350.    result['requests_second_asimgs'] = list[27]

  351.    result['traffic_ppsucai'] = list[28]

  352.    result['traffic_asimgs'] = list[29]

  353.    result['traffic'] = list[30]

  354.    result['traffic_static9'] = list[31]

  355.    result['count_static9'] = list[32]

  356.    result['percentage_static9_100'] = list[33]

  357.    result['percentage_static9_30_100'] = list[34]

  358.    result['percentage_static9_30'] = list[35]

  359.    result['requests_second_static9'] = list[36]

  360.    result['percentage_200_backup'] = list[37]

  361.    result['percentage_not_200_backup'] = list[38]

  362.    result['all'] = list

  363.    return result

  364. def Writelog(msg):

  365. o = open("/log/nginx/qos_result_new"+".log","aw")

  366.    o.write(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + ":" + msg + "\n")

  367.    o.close()

  368. def WriteTmpInfo(msg):

  369. o = open("/tmp/webcdnqos_result"+".txt","aw+")

  370.    o.write(msg+"\n")

  371.    o.close()

  372. def WriteURLInfo(msg):

  373. today = datetime.date.today()

  374. o = open("/tmp/webcdnqos_url_%s" % today.strftime('%Y-%m-%d') + ".log","aw")

  375. #   o.write(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + "    " +msg+"\n")

  376.    o.write(msg)

  377.    o.close()

  378. if __name__ == "__main__":

  379.    if len(sys.argv) <2:

  380.        show_usage()

  381.        os.popen("rm -f /tmp/exist.txt")

  382.            sys.exit(1)

  383.    else:

  384.        if os.path.isfile("/tmp/exist.txt"):

  385.            sys.exit(1)

  386.        else:

  387.            os.popen("echo 'hello' > /tmp/exist.txt")

  388. result_key = sys.argv[1]

  389. status = result_dic()

  390.            os.popen(">/tmp/webcdnqos_result.txt")

  391.            print status[result_key]

  392.            Writelog(str(status[result_key]))

  393.            for i in status.keys():

  394.                WriteTmpInfo(str(i)+"="+str(status[i]))

  395.            os.popen("rm -f /tmp/exist.txt")