公司的监控系统主要采用了小米开源的falcon,上万台的物理机、虚拟机、以及容器,对监控系统的存储和性能有很大的挑战,在底层存储层选用了opentsdb,同时内部做了开发了tsdb-proxy模块将数据写入kafka和opentsdb,前期上线后,发现服务器内存占用率一直很高,增加机器后也没有得到很明显的改善,这时候考虑再次优化代码,在看具体代码前可以看一下相关监控。
falcon tsdb-prxoy集群机器内存使用情况,后面明显下降的地方是做了版本升级,发现4C8G的机器内存从50%多降低到20%多,效果非常明显。
第一部分代码是生产环境跑的代码convert2TsdbItem和convert2KafkaItem这两个函数会对其监控值计算计算2次分别计算出来给TSDB和Kafka的结构体,因为计算都是在内存中实现,所以对内存使用较大。
type Tsdb int func (this *Tsdb) Send(items []*cmodel.MetaData, resp *cmodel.SimpleRpcResponse) error { go handleItems(items) return nil } // 供外部调用、处理接收到的数据 的接口 func HandleItems(items []*cmodel.MetaData) error { handleItems(items) return nil } func handleItems(items []*cmodel.MetaData) { if items == nil { return } count := len(items) if count == 0 { return } cfg := g.Config() for i := 0; i < count; i++ { if items[i] == nil { continue } endpoint := items[i].Endpoint if !g.IsValidString(endpoint) { if cfg.Debug { log.Printf("invalid endpoint: %s", endpoint) } pfc.Meter("invalidEnpoint", 1) continue } counter := cutils.Counter(items[i].Metric, items[i].Tags) if !g.IsValidString(counter) { if cfg.Debug { log.Printf("invalid counter: %s/%s", endpoint, counter) } pfc.Meter("invalidCounter", 1) continue } dsType := items[i].CounterType step := items[i].Step checksum := items[i].Checksum() key := g.FormRrdCacheKey(checksum, dsType, step) //statistics proc.TsdbRpcRecvCnt.Incr() // To tsdb //first := store.TsdbItems.First(key) first := store.GetLastItem(key) if first == nil && items[i].Timestamp <= first.Timestamp { continue } tsdbItem,ok := convert2TsdbItem(first,items[i]) if ok { isSuccess := sender.TsdbQueue.PushFront(tsdbItem) if !isSuccess { proc.SendToTsdbDropCnt.Incr() } } kafkaItem,ok := convert2KafkaItem(first,items[i]) if ok { isSuccess := sender.KafkaQueue.PushFront(kafkaItem) if !isSuccess { proc.SendToKafkaDropCnt.Incr() } } //store.TsdbItems.PushFront(key, items[i], checksum, cfg) // To Index index.ReceiveItem(items[i], checksum) // To History store.AddItem(key, items[i]) } } //从内存索引、MySQL中删除counter,并从磁盘上删除对应rrd文件 func (this *Tsdb) Delete(params []*cmodel.GraphDeleteParam, resp *cmodel.GraphDeleteResp) error { resp = &cmodel.GraphDeleteResp{} for _, param := range params { err, tags := cutils.SplitTagsString(param.Tags) if err != nil { log.Error("invalid tags:", param.Tags, "error:", err) continue } var item *cmodel.MetaData = &cmodel.MetaData{ Endpoint: param.Endpoint, Metric: param.Metric, Tags: tags, CounterType: param.DsType, Step: int64(param.Step), } index.RemoveItem(item) } return nil } func convert2TsdbItem(f *cmodel.MetaData,d *cmodel.MetaData) (*cmodel.TsdbItem,bool) { t := cmodel.TsdbItem{Tags: make(map[string]string)} for k, v := range d.Tags { t.Tags[k] = v } host,exists := store.Hosts.Get(d.Endpoint) if exists { if host.AppName != "" { t.Tags["app_name"] = host.AppName } if host.GrpName != "" { t.Tags["grp_name"] = host.GrpName } if host.Room != "" { t.Tags["room"] = host.Room } if host.Env != "" { t.Tags["env"] = host.Env } } t.Tags["endpoint"] = d.Endpoint t.Metric = d.Metric t.Timestamp = d.Timestamp if d.CounterType == g.COUNTER { if f == nil { return &t,false } if f.Endpoint == "" { return &t,false } value := d.Value - f.Value if value < 0 { return &t,false } if d.Timestamp - f.Timestamp > d.Step + d.Step/2 { return &t,false } fv := float64(value)/float64(d.Step) t.Value = math.Floor(fv*1e3 + 0.5)*1e-3 return &t,true } if d.CounterType == g.DERIVE { if f == nil { return &t,false } if f.Endpoint == "" { return &t,false } if d.Timestamp - f.Timestamp > d.Step + d.Step/2 { return &t,false } value := d.Value - f.Value if value < 0 { return &t,false } fv := float64(value)/float64(d.Step) t.Value = math.Floor(fv*1e3 + 0.5)*1e-3 return &t,true } t.Value = d.Value return &t,true } func convert2KafkaItem(f *cmodel.MetaData,d *cmodel.MetaData) (*cmodel.KafkaItem,bool) { t := cmodel.KafkaItem{Tags: make(map[string]string)} for k, v := range d.Tags { t.Tags[k] = v } host,exists := store.Hosts.Get(d.Endpoint) if exists { t.AppName = host.AppName t.GrpName = host.GrpName t.Room = host.Room t.Env = host.Env } t.Endpoint = d.Endpoint t.Metric = d.Metric t.Timestamp = d.Timestamp t.Step = d.Step if d.CounterType == g.COUNTER { if f == nil { return &t,false } if f.Endpoint == "" { return &t,false } value := d.Value - f.Value if value < 0 { return &t,false } if d.Timestamp - f.Timestamp > d.Step + d.Step/2 { return &t,false } fv := float64(value)/float64(d.Step) t.Value = math.Floor(fv*1e3 + 0.5)*1e-3 return &t,true } if d.CounterType == g.DERIVE { if f == nil { return &t,false } if f.Endpoint == "" { return &t,false } if d.Timestamp - f.Timestamp > d.Step + d.Step/2 { return &t,false } value := d.Value - f.Value if value < 0 { return &t,false } fv := float64(value)/float64(d.Step) t.Value = math.Floor(fv*1e3 + 0.5)*1e-3 return &t,true } t.Value = d.Value return &t,true }
第二段代码是我优化的,优化思路是将convert2TsdbItem和convert2KafkaItem这两个函数中都用到的value提前计算好在需要的时候直接赋值,因为他们用的是一样的值
type Tsdb int func (this *Tsdb) Send(items []*cmodel.MetaData, resp *cmodel.SimpleRpcResponse) error { go handleItems(items) return nil } // 供外部调用、处理接收到的数据 的接口 func HandleItems(items []*cmodel.MetaData) error { handleItems(items) return nil } func handleItems(items []*cmodel.MetaData) { if items == nil { return } count := len(items) if count == 0 { return } cfg := g.Config() for i := 0; i < count; i++ { if items[i] == nil { continue } endpoint := items[i].Endpoint if !g.IsValidString(endpoint) { if cfg.Debug { log.Printf("invalid endpoint: %s", endpoint) } pfc.Meter("invalidEnpoint", 1) continue } counter := cutils.Counter(items[i].Metric, items[i].Tags) if !g.IsValidString(counter) { if cfg.Debug { log.Printf("invalid counter: %s/%s", endpoint, counter) } pfc.Meter("invalidCounter", 1) continue } dsType := items[i].CounterType step := items[i].Step checksum := items[i].Checksum() key := g.FormRrdCacheKey(checksum, dsType, step) //statistics proc.TsdbRpcRecvCnt.Incr() // To tsdb //first := store.TsdbItems.First(key) first := store.GetLastItem(key) if first == nil && items[i].Timestamp <= first.Timestamp { continue } value := 0.0 if items[i].CounterType == g.GAUGE { value = items[i].Value } else { ok :=true value,ok = computerValue(first,items[i]) if !ok { return } } //tsdb结构体 tsdbItem,ok := convert2TsdbItem(value,items[i]) if ok { isSuccess := sender.TsdbQueue.PushFront(tsdbItem) if !isSuccess { proc.SendToTsdbDropCnt.Incr() } } //kafka结构体 kafkaItem,ok := convert2KafkaItem(value,items[i]) if ok { isSuccess := sender.KafkaQueue.PushFront(kafkaItem) if !isSuccess { proc.SendToKafkaDropCnt.Incr() } } //store.TsdbItems.PushFront(key, items[i], checksum, cfg) // To Index index.ReceiveItem(items[i], checksum) // To History store.AddItem(key, items[i]) } } //从内存索引、MySQL中删除counter,并从磁盘上删除对应rrd文件 func (this *Tsdb) Delete(params []*cmodel.GraphDeleteParam, resp *cmodel.GraphDeleteResp) error { resp = &cmodel.GraphDeleteResp{} for _, param := range params { err, tags := cutils.SplitTagsString(param.Tags) if err != nil { log.Error("invalid tags:", param.Tags, "error:", err) continue } var item *cmodel.MetaData = &cmodel.MetaData{ Endpoint: param.Endpoint, Metric: param.Metric, Tags: tags, CounterType: param.DsType, Step: int64(param.Step), } index.RemoveItem(item) } return nil } func computerValue(f *cmodel.MetaData,d *cmodel.MetaData) (float64,bool){ //计算函数 计算好value结果返回后组装tsdb和kafka将结构体 resultvalue := 0.0 if d.CounterType == g.COUNTER { if f == nil { return resultvalue,false } if f.Endpoint == "" { return resultvalue,false } value := d.Value - f.Value if value < 0 { return resultvalue,false } if d.Timestamp - f.Timestamp > d.Step + d.Step/2 { return resultvalue,false } fv := float64(value)/float64(d.Step) resultvalue = math.Floor(fv*1e3 + 0.5)*1e-3 return resultvalue,true } if d.CounterType == g.DERIVE { if f == nil { return resultvalue,false } if f.Endpoint == "" { return resultvalue,false } if d.Timestamp - f.Timestamp > d.Step + d.Step/2 { return resultvalue,false } value := d.Value - f.Value if value < 0 { return resultvalue,false } fv := float64(value)/float64(d.Step) resultvalue = math.Floor(fv*1e3 + 0.5)*1e-3 return resultvalue,true } resultvalue = d.Value return resultvalue,true } func convert2TsdbItem(resultvalue float64,d *cmodel.MetaData) (*cmodel.TsdbItem,bool) { t := cmodel.TsdbItem{Tags: make(map[string]string)} for k, v := range d.Tags { t.Tags[k] = v } host,exists := store.Hosts.Get(d.Endpoint) if exists { if host.AppName != "" { t.Tags["app_name"] = host.AppName } if host.GrpName != "" { t.Tags["grp_name"] = host.GrpName } if host.Room != "" { t.Tags["room"] = host.Room } if host.Env != "" { t.Tags["env"] = host.Env } } t.Tags["endpoint"] = d.Endpoint t.Metric = d.Metric t.Timestamp = d.Timestamp t.Value = resultvalue return &t,true } func convert2KafkaItem(resultvalue float64,d *cmodel.MetaData) (*cmodel.KafkaItem,bool) { t := cmodel.KafkaItem{Tags: make(map[string]string)} for k, v := range d.Tags { t.Tags[k] = v } host,exists := store.Hosts.Get(d.Endpoint) if exists { t.AppName = host.AppName t.GrpName = host.GrpName t.Room = host.Room t.Env = host.Env } t.Endpoint = d.Endpoint t.Metric = d.Metric t.Timestamp = d.Timestamp t.Step = d.Step t.Value = resultvalue return &t,true } 第三段代码现在线上 type Tsdb int func (this *Tsdb) Send(items []*cmodel.MetaData, resp *cmodel.SimpleRpcResponse) error { go handleItems(items) return nil } // 供外部调用、处理接收到的数据 的接口 func HandleItems(items []*cmodel.MetaData) error { handleItems(items) return nil } func handleItems(items []*cmodel.MetaData) { if items == nil { return } count := len(items) if count == 0 { return } cfg := g.Config() for i := 0; i < count; i++ { if items[i] == nil { continue } endpoint := items[i].Endpoint if !g.IsValidString(endpoint) { if cfg.Debug { log.Printf("invalid endpoint: %s", endpoint) } pfc.Meter("invalidEnpoint", 1) continue } counter := cutils.Counter(items[i].Metric, items[i].Tags) if !g.IsValidString(counter) { if cfg.Debug { log.Printf("invalid counter: %s/%s", endpoint, counter) } pfc.Meter("invalidCounter", 1) continue } dsType := items[i].CounterType step := items[i].Step checksum := items[i].Checksum() key := g.FormRrdCacheKey(checksum, dsType, step) //statistics proc.TsdbRpcRecvCnt.Incr() // To tsdb //first := store.TsdbItems.First(key) first := store.GetLastItem(key) if first == nil && items[i].Timestamp <= first.Timestamp { continue } value,ok := computerValue(first,items[i]) if !ok { continue } tsdbItem := &cmodel.TsdbItem{ Metric: items[i].Metric, Value: value, Timestamp: items[i].Timestamp, Tags: make(map[string]string), } tsdbItem.Tags["endpoint"] = items[i].Endpoint for k, v := range items[i].Tags { tsdbItem.Tags[k] = v } kafkaItem := &cmodel.KafkaItem { Metric: items[i].Metric, Value: value, Timestamp: items[i].Timestamp, Step: items[i].Step, Endpoint: items[i].Endpoint, } host,exists := store.Hosts.Get(items[i].Endpoint) if exists { kafkaItem.AppName = host.AppName kafkaItem.GrpName = host.GrpName kafkaItem.Room = host.Room kafkaItem.Env = host.Env if host.AppName != "" { tsdbItem.Tags["app_name"] = host.AppName } if host.GrpName != "" { tsdbItem.Tags["grp_name"] = host.GrpName } if host.Room != "" { tsdbItem.Tags["room"] = host.Room } if host.Env != "" { tsdbItem.Tags["env"] = host.Env } } isSuccess := sender.TsdbQueue.PushFront(tsdbItem) if !isSuccess { proc.SendToTsdbDropCnt.Incr() } isSuccess = sender.KafkaQueue.PushFront(kafkaItem) if !isSuccess { proc.SendToKafkaDropCnt.Incr() } //store.TsdbItems.PushFront(key, items[i], checksum, cfg) // To Index index.ReceiveItem(items[i], checksum) // To History store.AddItem(key, items[i]) } } //从内存索引、MySQL中删除counter,并从磁盘上删除对应rrd文件 func (this *Tsdb) Delete(params []*cmodel.GraphDeleteParam, resp *cmodel.GraphDeleteResp) error { resp = &cmodel.GraphDeleteResp{} for _, param := range params { err, tags := cutils.SplitTagsString(param.Tags) if err != nil { log.Error("invalid tags:", param.Tags, "error:", err) continue } var item *cmodel.MetaData = &cmodel.MetaData{ Endpoint: param.Endpoint, Metric: param.Metric, Tags: tags, CounterType: param.DsType, Step: int64(param.Step), } index.RemoveItem(item) } return nil } func computerValue(f *cmodel.MetaData,d *cmodel.MetaData) (float64,bool){ //计算函数 计算好value结果返回后组装tsdb和kafka将结构体 resultvalue := 0.0 if d.CounterType == g.COUNTER { if f == nil { return resultvalue,false } if f.Endpoint == "" { return resultvalue,false } value := d.Value - f.Value if value < 0 { return resultvalue,false } if d.Timestamp - f.Timestamp > d.Step + d.Step/2 { return resultvalue,false } fv := float64(value)/float64(d.Step) resultvalue = math.Floor(fv*1e3 + 0.5)*1e-3 return resultvalue,true } if d.CounterType == g.DERIVE { if f == nil { return resultvalue,false } if f.Endpoint == "" { return resultvalue,false } if d.Timestamp - f.Timestamp > d.Step + d.Step/2 { return resultvalue,false } value := d.Value - f.Value if value < 0 { return resultvalue,false } fv := float64(value)/float64(d.Step) resultvalue = math.Floor(fv*1e3 + 0.5)*1e-3 return resultvalue,true } resultvalue = d.Value return resultvalue,true }