为什么要做监控?
–熟悉IT监控系统的设计原理
–开发一个简版的类Zabbix监控系统
–掌握自动化开发项目的程序设计思路及架构解耦原则
常用监控系统设计讨论
Zabbix
Nagios
监控系统需求讨论
1.可监控常用系统服务、应用、网络设备等
2.一台主机上可监控多个不同服务、不同服务的监控间隔可不同
3.同一个服务在不同主机上的监控间隔、报警阈值可不同
4.可以批量的给一批主机添加、删除、修改要监控的服务
5.告警级别:
- 不同的服务 因为业务重要程度不同,如果出了问题可以设置不同的报警级别
- 可以指定特定的服务或告警级别的事件通知给特定的用户
- 告警的升级设定
6.历史数据 的存储和优化
- 实现用最少的空间占用量存储最多的有效数据
- 如何做到1秒钟之内取出一台主机上所有服务的5年的监控数据?
7. 数据可视化,如何做出简洁美观的用户界面?
8.如何实现单机支持5000+机器监控需求?
9.采取何种通信方式?主动、被动?
10.如何实现监控服务器的水平扩展?
采用什么架构?
•Mysql
•主动通信? Snmp,wget…
•被动通信?Agent ---how to communicate with the monitor server
•Socket server –> Socket client
•能否用现成的C/S架构? RabbitMQ, Redis 订阅发布, HTTP ?
采用HTTP好处
1.接口设计简单
2.容易水平扩展做分布式
3.Socket稳定成熟,省去较多的通信维护精力
Http特性:
1.短连接
2.无状态
3.安全认证
4.被动通信
监控系统架构设计
1 #!_*_coding:utf8_*_
2 from django.db import models
3
4 # Create your models here.
5
6
7
8
class Host(models.Model):
    """A monitored machine, identified by a unique name and a unique IP address.

    A host may optionally belong to host groups and may optionally be linked
    to templates directly.
    """

    name = models.CharField(max_length=64, unique=True)
    ip_addr = models.GenericIPAddressField(unique=True)

    # The author's examples: groups "A B C", direct templates "A D E".
    # NOTE(review): presumably illustrating that the two sets can overlap -- confirm.
    host_groups = models.ManyToManyField('HostGroup', blank=True)
    templates = models.ManyToManyField("Template", blank=True)

    # Allowed values for ``monitored_by`` (label: "monitoring method").
    monitored_by_choices = (
        ('agent', 'Agent'),
        ('snmp', 'SNMP'),
        ('wget', 'WGET'),
    )
    monitored_by = models.CharField(
        u'监控方式',
        max_length=64,
        choices=monitored_by_choices,
    )

    # Allowed values for ``status``; a new host defaults to 1 (Online).
    status_choices = (
        (1, 'Online'),
        (2, 'Down'),
        (3, 'Unreachable'),
        (4, 'Offline'),
    )
    status = models.IntegerField(
        u'状态',
        choices=status_choices,
        default=1,
    )

    # Free-form remarks; may be left empty.
    memo = models.TextField(u"备注", blank=True, null=True)

    def __unicode__(self):
        """Return the host name as the display text."""
        return self.name
31
class HostGroup(models.Model):
    """A uniquely named group that can be linked to templates.

    Hosts reference groups through ``Host.host_groups``.
    """

    name = models.CharField(max_length=64, unique=True)
    templates = models.ManyToManyField("Template", blank=True)

    # Free-form remarks; may be left empty.
    memo = models.TextField(u"备注", blank=True, null=True)

    def __unicode__(self):
        """Return the group name as the display text."""
        return self.name
38
class ServiceIndex(models.Model):
    """A single index (metric) definition: a name, a key and a data type."""

    name = models.CharField(max_length=64)
    key = models.CharField(max_length=64)

    # Allowed values for ``data_type`` (label: "index data type").
    data_type_choices = (
        ('int', "int"),
        ('float', "float"),
        ('str', "string"),
    )
    data_type = models.CharField(
        u'指标数据类型',
        max_length=32,
        choices=data_type_choices,
        default='int',
    )

    # Short remarks; may be left empty.
    memo = models.CharField(u"备注", max_length=128, blank=True, null=True)

    def __unicode__(self):
        """Return ``<name>.<key>`` as the display text."""
        label_parts = (self.name, self.key)
        return "%s.%s" % label_parts
51
class Service(models.Model):
    """A monitorable service: a plugin name, a check interval and its indexes."""

    # Label: "service name"; must be unique.
    name = models.CharField(u'服务名称', max_length=64, unique=True)

    # Label: "monitoring interval"; defaults to 60.
    # NOTE(review): the unit is presumably seconds -- confirm against the client.
    interval = models.IntegerField(u'监控间隔', default=60)

    # Label: "plugin name"; 'n/a' is the placeholder default.
    plugin_name = models.CharField(u'插件名', max_length=64, default='n/a')

    # Label: "index list"; a service may be saved without any index.
    items = models.ManyToManyField(
        'ServiceIndex',
        verbose_name=u"指标列表",
        blank=True,
    )

    # Short remarks; may be left empty.
    memo = models.CharField(u"备注", max_length=128, blank=True, null=True)

    def __unicode__(self):
        """Return the service name as the display text."""
        return self.name
61 #def get_service_items(obj):
62 # return ",".join([i.name for i in obj.items.all()])
63
class Template(models.Model):
    """A uniquely named bundle of services and, optionally, triggers.

    Hosts and host groups link to templates through their own
    ``templates`` many-to-many fields.
    """

    # Label: "template name".
    name = models.CharField(u'模版名称', max_length=64, unique=True)

    # Label: "service list".
    services = models.ManyToManyField('Service', verbose_name=u"服务列表")

    # Label: "trigger list"; optional.
    triggers = models.ManyToManyField(
        'Trigger',
        verbose_name=u"触发器列表",
        blank=True,
    )

    def __unicode__(self):
        """Return the template name as the display text."""
        return self.name
70 '''
71 class TriggerExpression(models.Model):
72 name = models.CharField(u"触发器表达式名称",max_length=64,blank=True,null=True)
73 service = models.ForeignKey(Service,verbose_name=u"关联服务")
74 service_index = models.ForeignKey(ServiceIndex,verbose_name=u"关联服务指标")
75 logic_type_choices = (('or','OR'),('and','AND'))
76 logic_type = models.CharField(u"逻辑关系",choices=logic_type_choices,max_length=32,blank=True,null=True)
77 left_sibling = models.ForeignKey('self',verbose_name=u"左边条件",blank=True,null=True,related_name='left_sibling_condition' )
78 operator_type_choices = (('eq','='),('lt','<'),('gt','>'))
79 operator_type = models.CharField(u"运算符",choices=operator_type_choices,max_length=32)
80 data_calc_type_choices = (
81 ('avg','Average'),
82 ('max','Max'),
83 ('hit','Hit'),
84 ('last','Last'),
85 )
86 data_calc_func= models.CharField(u"数据处理方式",choices=data_calc_type_choices,max_length=64)
87 data_calc_args = models.CharField(u"函数传入参数",help_text=u"若是多个参数,则用,号分开,第一个值是时间",max_length=64)
88 threshold = models.IntegerField(u"阈值")
89
90 def __unicode__(self):
91 return "%s %s(%s(%s))" %(self.service_index,self.operator_type,self.data_calc_func,self.data_calc_args)
92 '''
93
94
class TriggerExpression(models.Model):
    """One condition belonging to a ``Trigger``.

    A condition names a service and one of its indexes, a data-processing
    function with its arguments, a comparison operator and an integer
    threshold. ``logic_type`` optionally records an OR/AND relation to
    another condition.
    """

    # Label: "owning trigger".
    trigger = models.ForeignKey('Trigger', verbose_name=u"所属触发器")
    # Label: "related service".
    service = models.ForeignKey(Service, verbose_name=u"关联服务")
    # Label: "related service index".
    service_index = models.ForeignKey(
        ServiceIndex,
        verbose_name=u"关联服务指标",
    )
    # Label: "only monitor the specifically named index key"; optional.
    specified_index_key = models.CharField(
        verbose_name=u"只监控专门指定的指标key",
        max_length=64,
        blank=True,
        null=True,
    )

    # Comparison operators (label: "operator").
    operator_type_choices = (
        ('eq', '='),
        ('lt', '<'),
        ('gt', '>'),
    )
    operator_type = models.CharField(
        u"运算符",
        choices=operator_type_choices,
        max_length=32,
    )

    # Data-processing functions (label: "data processing method").
    data_calc_type_choices = (
        ('avg', 'Average'),
        ('max', 'Max'),
        ('hit', 'Hit'),
        ('last', 'Last'),
    )
    data_calc_func = models.CharField(
        u"数据处理方式",
        choices=data_calc_type_choices,
        max_length=64,
    )
    # Label: "function arguments". The help text says multiple arguments are
    # comma-separated and the first value is the time.
    data_calc_args = models.CharField(
        u"函数传入参数",
        help_text=u"若是多个参数,则用,号分开,第一个值是时间",
        max_length=64,
    )
    # Label: "threshold".
    threshold = models.IntegerField(u"阈值")

    # Optional logical relation to another condition.
    # NOTE(review): the label reads "logical relation with a condition";
    # which neighbouring condition is meant is not stated here -- confirm.
    logic_type_choices = (
        ('or', 'OR'),
        ('and', 'AND'),
    )
    logic_type = models.CharField(
        u"与一个条件的逻辑关系",
        choices=logic_type_choices,
        max_length=32,
        blank=True,
        null=True,
    )
    # NOTE: a self-referencing ``next_condition`` foreign key was sketched
    # here by the author and is currently disabled.

    def __unicode__(self):
        """Return ``<index> <operator>(<function>(<args>))`` as display text."""
        display_parts = (
            self.service_index,
            self.operator_type,
            self.data_calc_func,
            self.data_calc_args,
        )
        return "%s %s(%s(%s))" % display_parts

    class Meta:
        # NOTE: unique_together = ('trigger_id', 'service') was sketched by
        # the author and is currently disabled, so no options are set.
        pass
121
class Trigger(models.Model):
    """A named alert rule with a severity level.

    The rule's conditions are stored as ``TriggerExpression`` rows that point
    back to it through their ``trigger`` foreign key.
    """

    # Label: "trigger name".
    name = models.CharField(u'触发器名称', max_length=64)

    # Severity levels, lowest (1) to highest (5).
    # Fixed: the top label was misspelled 'Diaster'.
    severity_choices = (
        (1, 'Information'),
        (2, 'Warning'),
        (3, 'Average'),
        (4, 'High'),
        (5, 'Disaster'),
    )
    # NOTE: a free-text ``expressions`` field and a direct many-to-many to
    # TriggerExpression were sketched by the author and are currently disabled.

    # Label: "alert level".
    severity = models.IntegerField(u'告警级别', choices=severity_choices)
    enabled = models.BooleanField(default=True)

    # Free-form remarks; may be left empty.
    memo = models.TextField(u"备注", blank=True, null=True)

    def __unicode__(self):
        """Return the trigger name and its severity label as display text."""
        # Fixed: the text previously read "serice" instead of "service".
        return "<service:%s, severity:%s>" % (
            self.name,
            self.get_severity_display(),
        )
139
140
141
class Action(models.Model):
    """An alerting policy applied to hosts and/or host groups.

    It holds the alert conditions, the interval between alerts, the
    operations to run, and the settings for the recovery notification.
    """

    name = models.CharField(max_length=64, unique=True)

    # Targets; both relations are optional.
    host_groups = models.ManyToManyField('HostGroup', blank=True)
    hosts = models.ManyToManyField('Host', blank=True)

    # Label: "alert conditions".
    conditions = models.TextField(u'告警条件')
    # Label: "alert interval (s)"; defaults to 300.
    interval = models.IntegerField(u'告警间隔(s)', default=300)
    operations = models.ManyToManyField('ActionOperation')

    # Label: "send a notification after the fault recovers".
    recover_notice = models.BooleanField(
        u'故障恢复后发送通知消息',
        default=True,
    )
    recover_subject = models.CharField(max_length=128, blank=True, null=True)
    recover_message = models.TextField(blank=True, null=True)

    enabled = models.BooleanField(default=True)

    def __unicode__(self):
        """Return the action name as the display text."""
        return self.name
159
class ActionOperation(models.Model):
    """One operation of an ``Action``: an action type and an alert step."""

    name = models.CharField(max_length=64)

    # Label: "the n-th alert"; defaults to 1.
    step = models.SmallIntegerField(u"第n次告警", default=1)

    # Allowed values for ``action_type`` (label: "action type").
    action_type_choices = (
        ('email', 'Email'),
        ('sms', 'SMS'),
        ('script', 'RunScript'),
    )
    action_type = models.CharField(
        u"动作类型",
        choices=action_type_choices,
        default='email',
        max_length=64,
    )
    # NOTE: a ``notifiers`` many-to-many to a user-profile model was sketched
    # by the author and is currently disabled.

    def __unicode__(self):
        """Return the operation name as the display text."""
        return self.name
172
173
class Maintenance(models.Model):
    """A uniquely named maintenance record with a start and end time.

    It can be linked to individual hosts and/or host groups; both
    relations are optional.
    """

    name = models.CharField(max_length=64, unique=True)
    hosts = models.ManyToManyField('Host', blank=True)
    host_groups = models.ManyToManyField('HostGroup', blank=True)

    # Label: "maintenance content".
    content = models.TextField(u"维护内容")

    start_time = models.DateTimeField()
    end_time = models.DateTimeField()

    def __unicode__(self):
        """Return the maintenance name as the display text."""
        return self.name
184
185 ''''
186 CPU
187 idle 80
188 usage 90
189 system 30
190 user
191 iowait 50
192
193 memory :
194 usage
195 free
196 swap
197 cache
198 buffer
199
200 load:
201 load1
202 load 5
203 load 15
204 '''
表设计结构