2021SC@SDUSC
L2Agent运行机制
本文内容的前半部分(daemon_loop之前)参考了已有的资料,后半部分来自对源码的自主阅读。
概述
L2Agent通常运行在Hypervisor上,与neutron-server通过RPC通信,监听并通知设备的变化,创建新的设备来确保网络segment的正确性,应用security groups规则等。例如,OVS Agent使用Open vSwitch,通过VLAN、GRE、VxLAN来实现网络的隔离,同时还负责网络流量的转发控制。
源码聚焦
neutron-18.1.1/neutron/plugins/ml2/drivers/openvswitch/agent/ovs_neutron_agent.py
main函数
def main(bridge_classes):
    """Entry point: register options, build the OVS agent and run its loop.

    Registers capabilities and configuration options, validates the tunnel
    configuration, constructs an ``OVSNeutronAgent`` (retrying while the
    constructor raises an OVSDB timeout) and finally calls
    ``agent.daemon_loop()``.

    :param bridge_classes: passed unchanged as the first argument to
        ``OVSNeutronAgent``.
    """
    ovs_capabilities.register()
    ext_manager.register_opts(cfg.CONF)
    agent_config.setup_privsep()
    service_conf.register_service_opts(service_conf.RPC_EXTRA_OPTS, cfg.CONF)

    ext_mgr = ext_manager.L2AgentExtensionsManager(cfg.CONF)

    # now that all extensions registered their options, we can log them
    n_utils.log_opt_values(LOG)

    validate_tunnel_config(cfg.CONF.AGENT.tunnel_types, cfg.CONF.OVS.local_ip)

    # Counts construction attempts, starting at 1 for the first try.
    init_try = 1
    while True:
        try:
            agent = OVSNeutronAgent(bridge_classes, ext_mgr, cfg.CONF)
            capabilities.notify_init_event(n_const.AGENT_TYPE_OVS, agent)
            # Construction and notification succeeded; stop retrying.
            break
        except ovs_exceptions.TimeoutException as e:
            # A timeout is retried until INIT_MAX_TRIES attempts have been
            # made; after that the process exits with status 1.
            if init_try < INIT_MAX_TRIES:
                LOG.warning("Ovsdb command timeout!")
                init_try += 1
            else:
                LOG.error("%(err)s agent terminated after %(attempts)s "
                          "initialization attempts!",
                          {'err': e, 'attempts': init_try})
                sys.exit(1)
        except (RuntimeError, ValueError) as e:
            # These errors are not retried: log and exit immediately.
            LOG.error("%s agent terminated!", e)
            sys.exit(1)
    agent.daemon_loop()
图解:
前面做了一些准备工作,最重要的是OVS Agent的初始化(即OVSNeutronAgent的构造函数),这个函数里面调用self.setup_integration_br()完成:安装integration网桥、创建patch ports并移除所有现有的流规则、添加基本的流规则;调用self.setup_rpc()完成:设置用来与neutron-server通信的plugin_rpc、设置用于agent状态信息上报的state_rpc、设置用于接收neutron-server消息的connection,并且启动心跳的周期上报,周期默认为30s(Neutron server端启动了rpc_listeners,对agent发过来的消息进行监听;对于心跳的监听,如果接收到心跳信号,就会对数据库中的时间戳进行更新;如果时间戳一直不更新,当前时间减去最后一次更新的时间戳超过默认的agent_down_time=75s时,则认为agent处于down的状态);调用self.setup_physical_bridges(self.bridge_mappings)完成了物理网桥br-eth*的创建,创建好网桥之后,与安装br-int一样,首先删除了现有的所有流规则,并添加了同样为normal的流规则,用以转发消息,接下来是与br-int不同的地方,根据use_veth_interconnection决定是否使用veth与br-int进行连接,并配置veth或者patch port,然后通过设置drop流规则,封锁桥之间的通信,然后使能veth或者patch ports进行通信;其余函数完成对DVR Agent(分布式路由代理)和Security Group Agent(安全组代理)的初始化工作,用于处理DVR和security group。(再次感谢)
最后,开始daemon_loop,从此开始agent正式工作:
def daemon_loop(self):
    """Install signal handlers, start bridge monitoring and run rpc_loop.

    ``rpc_loop`` is called inside the polling manager's ``with`` block, so
    the manager is only exited once ``rpc_loop`` returns or raises.
    """
    # Start everything.
    LOG.info("Agent initialized successfully, now running... ")
    # Register (not send) the handlers: _handle_sigterm is invoked when the
    # process receives SIGTERM.
    signal.signal(signal.SIGTERM, self._handle_sigterm)
    # Only register a SIGHUP handler when the signal module defines SIGHUP.
    if hasattr(signal, 'SIGHUP'):
        signal.signal(signal.SIGHUP, self._handle_sighup)
    # Monitor the bridges held in self.phys_brs, identified by br_name.
    br_names = [br.br_name for br in self.phys_brs.values()]

    self.ovs.ovsdb.idl_monitor.start_bridge_monitor(br_names)

    # The polling manager is given the (filtered) integration bridge name.
    bridge_names = polling.filter_bridge_names([self.int_br.br_name])
    with polling.get_polling_manager(
            self.minimize_polling,
            self.ovsdb_monitor_respawn_interval,
            bridge_names=bridge_names,
            ovs=self.ovs) as pm:
        self.rpc_loop(polling_manager=pm)
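它先通过signal.signal(…)把_handle_sigterm(…)注册为SIGTERM信号的处理函数(并不是向它“发信号”,而是进程收到SIGTERM时才会调用它,由它处理退出相关的事情,例如退出时的RPC超时设置);如果平台支持SIGHUP,还会把_handle_sighup(…)注册为SIGHUP的处理函数。然后启动对物理网桥的监控,最后在polling manager的上下文中开始rpc_loop(…)。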
rpc_loop(self, polling_manager):
def rpc_loop(self, polling_manager):
idl_monitor = self.ovs.ovsdb.idl_monitor
sync = False
ports = set()
updated_ports_copy = set()
activated_bindings_copy = set()
ancillary_ports = set()
tunnel_sync = True
ovs_restarted = False
consecutive_resyncs = 0
need_clean_stale_flow = True
ports_not_ready_yet = set()
failed_devices = {'added': set(), 'removed': set()}
failed_ancillary_devices = {'added': set(), 'removed': set()}
failed_devices_retries_map = {}
while self._check_and_handle_signal():
# 在rpc_loop中,只要没有收到停止信号(即self._check_and_handle_signal()返回True),循环就会一直执行下去
if self.fullsync:
LOG.info("rpc_loop doing a full sync.")
sync = True
self.fullsync = False
port_info = {}
ancillary_port_info = {}
start = time.time()
LOG.info("Agent rpc_loop - iteration:%d started",
self.iter_num)
self.ovs_status = self.check_ovs_status()
bridges_recreated = False
if self.ovs_status == constants.OVS_RESTARTED:
self._handle_ovs_restart(polling_manager)
tunnel_sync = self.enable_tunneling or tunnel_sync
如果ovs已经离线,Agent不会提供任何服务,以防意料之外的失效或宕机。所以要定期检查ovs的状态。
elif self.ovs_status == constants.OVS_DEAD:
port_stats = self.get_port_stats({}, {})
self.loop_count_and_wait(start, port_stats)
continue
否则(ovs既没有重启,也没有离线),检查最近是否有物理网桥被重新创建;ovs重启的情况已经在上面的分支中处理过,所以那种情况下不需要这一步检查:
else:
added_bridges = idl_monitor.bridges_added + self.added_bridges
bridges_recreated = self._reconfigure_physical_bridges(
added_bridges)
if bridges_recreated:
# 如果有网桥被重新创建,我们需要确保网桥中没有残留的过期(stale)流规则
need_clean_stale_flow = True
sync |= bridges_recreated
把tunnel IP通知给插件(plugin);如果同步过程中出现异常,则保持tunnel_sync = True,留到下一轮循环重试:
if self.enable_tunneling and tunnel_sync:
try:
tunnel_sync = self.tunnel_sync()
except Exception:
LOG.exception("Error while configuring tunnel endpoints")
tunnel_sync = True
ovs_restarted |= (self.ovs_status == constants.OVS_RESTARTED)
devices_need_retry = (any(failed_devices.values()) or
any(failed_ancillary_devices.values()) or
ports_not_ready_yet)
if (self._agent_has_updates(polling_manager) or sync or
devices_need_retry):
try:
LOG.info("Agent rpc_loop - iteration:%(iter_num)d - "
"starting polling. Elapsed:%(elapsed).3f",
{'iter_num': self.iter_num,
'elapsed': time.time() - start})
if self.conf.AGENT.baremetal_smartnic:
if sync:
self.process_smartnic_ports()
updated_smartnic_ports_copy = (
self.updated_smartnic_ports)
self.updated_smartnic_ports = list()
for port_data in updated_smartnic_ports_copy:
self.treat_smartni
先保存一份更新过的端口集合(以及已激活的绑定),以便之后需要重新同步时可以回滚,然后清空self.updated_ports(和self.activated_bindings)。
updated_ports_copy = self.updated_ports
self.updated_ports = set()
activated_bindings_copy = self.activated_bindings
self.activated_bindings = set()
(port_info, ancillary_port_info, consecutive_resyncs,
ports_not_ready_yet) = (self.process_port_info(
start, polling_manager, sync, ovs_restarted,
ports, ancillary_ports, updated_ports_copy,
consecutive_resyncs, ports_not_ready_yet,
failed_devices, failed_ancillary_devices))
sync = False
self.process_deleted_ports(port_info)
self.process_deactivated_bindings(port_info)
self.process_activated_bindings(port_info,
activated_bindings_copy)
ofport_changed_ports = self.update_stale_ofport_rules()
if ofport_changed_ports:
port_info.setdefault('updated', set()).update(
ofport_changed_ports)
LOG.info("Agent rpc_loop - iteration:%(iter_num)d - "
"port information retrieved. "
"Elapsed:%(elapsed).3f",
{'iter_num': self.iter_num,
'elapsed': time.time() - start})
为VIF应用安全规则并进行接线/拆线(wire/unwire),同时在Neutron server上更新它们的状态:
if (self._port_info_has_changes(port_info) or
self.sg_agent.firewall_refresh_needed() or
ovs_restarted):
LOG.debug("Starting to process devices in:%s",
port_info)
provisioning_needed = (
ovs_restarted or bridges_recreated)
failed_devices = self.process_network_ports(
port_info, provisioning_needed)
LOG.info("Agent rpc_loop - iteration:%(iter_num)d - "
"ports processed. Elapsed:%(elapsed).3f",
{'iter_num': self.iter_num,
'elapsed': time.time() - start})
if need_clean_stale_flow:
self.cleanup_stale_flows()
need_clean_stale_flow = False
LOG.info("Agent rpc_loop - iteration:%(iter_num)d - "
"cleanup stale flows. Elapsed:%(elapsed).3f",
{'iter_num': self.iter_num,
'elapsed': time.time() - start})
ports = port_info['current']
if self.ancillary_brs:
failed_ancillary_devices = (
self.process_ancillary_network_ports(
ancillary_port_info))
LOG.info("Agent rpc_loop - iteration: "
"%(iter_num)d - ancillary ports "
"processed. Elapsed:%(elapsed).3f",
{'iter_num': self.iter_num,
'elapsed': time.time() - start})
ancillary_ports = ancillary_port_info['current']
polling_manager.polling_completed()
failed_devices_retries_map = (
self.update_retries_map_and_remove_devs_not_to_retry(
failed_devices, failed_ancillary_devices,
failed_devices_ret
ovs_restarted = False
self._dispose_local_vlan_hints()
“把ovs_restarted = False放在try代码块的最后一行,这样执行到这一行时就可以确定前面没有发生其它异常”
但可以看到源码中它其实并不是最后一行,后面还有一个self._dispose_local_vlan_hints(),不过这个调用看起来不会抛出异常。
except Exception:
LOG.exception("Error while processing VIF ports")
# Put the ports back in self.updated_port
self.updated_ports |= updated_ports_copy
self.activated_bindings |= activated_bindings_copy
sync = True
port_stats = self.get_port_stats(port_info, ancillary_port_info)
self.loop_count_and_wait(start, port_stats)
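最后做异常处理:如果处理端口的过程中出现异常,就记录日志,把本轮取出的端口和绑定重新放回self.updated_ports和self.activated_bindings,并把sync置为True,以便下一轮循环重新同步。无论是否出现异常,每轮循环结束时都会统计端口信息(get_port_stats),然后调用loop_count_and_wait(…)等待进入下一轮循环。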