在管理 AWS 上的 EC2 实例时,监控系统资源的利用率是至关重要的。AWS 提供了丰富的监控工具,其中包括 CloudWatch,它可以帮助您实时监控和管理各种资源的性能。
监控CPU使用率
import boto3
# Create the CloudWatch client used for every put_metric_alarm call below
cloudwatch = boto3.client('cloudwatch')

# EC2 instance IDs to monitor
ec2_instances = [
    'i-xxxxxxxxx1',
    'i-xxxxxxxxx2',
    'i-xxxxxxxxx3',
]

# SNS topic referenced by both the ALARM and the OK actions
sns_topic_arn = "arn:aws:sns:us-east-1:820700710557:ops-alarm"

# Create one CPU-utilization alarm per instance
for ec2_instance in ec2_instances:
    alarm_name = f'ec2_CPUUtilization_{ec2_instance}'

    try:
        response = cloudwatch.put_metric_alarm(
            AlarmName=alarm_name,
            OKActions=[sns_topic_arn],
            AlarmActions=[sns_topic_arn],
            ComparisonOperator='GreaterThanThreshold',
            # Alarm when 3 of the last 5 one-minute datapoints breach.
            # NOTE(review): a 60-second period presumably needs detailed
            # monitoring enabled on the instance to get a datapoint per
            # period - confirm.
            DatapointsToAlarm=3,
            EvaluationPeriods=5,
            MetricName='CPUUtilization',
            Namespace='AWS/EC2',
            Period=60,
            Statistic='Average',
            Threshold=85.0,
            # NOTE(review): actions are disabled here, so the SNS topic above
            # is never notified; the disk and memory alarms in this guide
            # enable them - confirm this is intentional.
            ActionsEnabled=False,
            TreatMissingData="notBreaching",
            AlarmDescription='Alarm when server CPU exceeds 85%',
            Dimensions=[
                {
                    'Name': 'InstanceId',
                    'Value': ec2_instance,
                },
            ],
            Unit='Percent',  # CPUUtilization is published in Percent
        )
    except cloudwatch.exceptions.ClientError as error:
        # boto3 raises on API errors instead of returning a non-200 response;
        # report the failure and carry on with the remaining instances.
        print(f"Failed to create alarm for EC2 instance '{ec2_instance}'.")
        print(f"  {error}")
        continue

    # Check if alarm was successfully created
    if response['ResponseMetadata']['HTTPStatusCode'] == 200:
        print(f"Successfully created alarm '{alarm_name}' for EC2 instance '{ec2_instance}'.")
    else:
        print(f"Failed to create alarm for EC2 instance '{ec2_instance}'.")
安装 CloudWatch Agent
1. 安装 CloudWatch Agent
使用以下命令安装 CloudWatch Agent:
sudo yum install amazon-cloudwatch-agent -y
2. 配置 IAM 角色权限
确保您的 EC2 实例具有适当的 IAM 角色权限。您可以使用以下 IAM 策略:
{
"Version": "2012-10-17",
"Statement": [
{
"Sid": "VisualEditor0",
"Effect": "Allow",
"Action": [
"logs:PutLogEvents",
"cloudwatch:PutMetricData"
],
"Resource": "*"
}
]
}
3. 下载并配置代理
下载或创建 CloudWatch Agent 配置文件,并将其保存为 /opt/aws/amazon-cloudwatch-agent/etc/amazon-cloudwatch-agent.json(配置内容见下文“配置 CloudWatch Agent”一节),然后运行以下命令加载该配置并启动代理:
sudo /opt/aws/amazon-cloudwatch-agent/bin/amazon-cloudwatch-agent-ctl -a fetch-config -m ec2 -s -c file:/opt/aws/amazon-cloudwatch-agent/etc/amazon-cloudwatch-agent.json
4. 启动 CloudWatch Agent 服务
使用以下命令启动 CloudWatch Agent 服务:
sudo systemctl restart amazon-cloudwatch-agent.service
配置 CloudWatch Agent
编辑 CloudWatch Agent 配置文件 /opt/aws/amazon-cloudwatch-agent/etc/amazon-cloudwatch-agent.json,将以下内容写入配置文件。修改配置后,需要重新执行上文步骤 3 中的 fetch-config 命令才能使新配置生效:
{
  "agent": {
    "run_as_user": "root"
  },
  "metrics": {
    "metrics_collected": {
      "mem": {
        "measurement": [
          "mem_used_percent"
        ],
        "metrics_collection_interval": 180
      },
      "cpu": {
        "measurement": [
          "cpu_usage_idle",
          "cpu_usage_iowait",
          "cpu_usage_user",
          "cpu_usage_system"
        ],
        "metrics_collection_interval": 180,
        "totalcpu": true
      },
      "diskio": {
        "measurement": [
          "io_time",
          "write_bytes",
          "read_bytes",
          "writes",
          "reads"
        ],
        "metrics_collection_interval": 180,
        "resources": [
          "*"
        ]
      },
      "netstat": {
        "measurement": [
          "tcp_established",
          "tcp_time_wait"
        ],
        "metrics_collection_interval": 180
      },
      "statsd": {
        "metrics_aggregation_interval": 60,
        "metrics_collection_interval": 180,
        "service_address": ":8125"
      },
      "swap": {
        "measurement": [
          "swap_used_percent"
        ],
        "metrics_collection_interval": 180
      },
      "disk": {
        "resources": [
          "/",
          "/tmp"
        ],
        "measurement": [
          "disk_used_percent"
        ],
        "ignore_file_system_types": [
          "sysfs", "devtmpfs"
        ],
        "metrics_collection_interval": 60
      }
    },
    "force_flush_interval": 60
  }
}
查看 CloudWatch Agent 状态
运行以下命令查看 CloudWatch Agent 的状态:
sudo /opt/aws/amazon-cloudwatch-agent/bin/amazon-cloudwatch-agent-ctl -m ec2 -a status
以上是完整的 CloudWatch Agent 安装和配置指南。请按照步骤进行操作,并确保代理已成功配置并运行。
监控磁盘利用率
import boto3
# Create the CloudWatch client that create_disk_alarm() uses to call put_metric_alarm
cloudwatch = boto3.client('cloudwatch')
# Host names to create disk alarms for; each one is used as the value of the
# "host" dimension below.
host_name_list = ['ip-172-31-71-29.ec2.internal', 'ip-172-31-71-30.ec2.internal', 'ip-172-31-71-31.ec2.internal']

# Build one dimension set per host. Every host gets the same path, device and
# fstype values; only the "host" entry differs.
# NOTE(review): presumably these values must match the dimensions the agent
# publishes for disk_used_percent on each host - confirm device/fstype per host.
dimensions_list = []
for host_name in host_name_list:
    dimensions = [
        {'Name': 'path', 'Value': '/'},
        # Keep "host" at index 1: create_disk_alarm reads dimensions[1]["Value"]
        # to name the alarm and to report success or failure.
        {'Name': 'host', 'Value': host_name},
        {'Name': 'device', 'Value': 'nvme0n1p2'},
        {'Name': 'fstype', 'Value': 'xfs'},
    ]
    dimensions_list.append(dimensions)
# Define function to create CloudWatch alarm
def create_disk_alarm():
    """Create one disk-utilization alarm per entry in ``dimensions_list``.

    Each alarm watches the ``disk_used_percent`` metric in the ``CWAgent``
    namespace and goes into ALARM when the 300-second average exceeds 85.
    The alarm is named ``<host>_disk_utilization_alarm``, where ``<host>`` is
    the value of the second dimension (``dimensions[1]``).

    The outcome for each host is printed. An API error for one host is
    reported and does not stop alarm creation for the remaining hosts.
    """
    for dimensions in dimensions_list:
        # The "host" dimension sits at index 1 of every dimension set.
        host = dimensions[1]["Value"]

        try:
            response = cloudwatch.put_metric_alarm(
                AlarmName=f'{host}_disk_utilization_alarm',
                AlarmDescription='EC2 Disk Utilization Alarm',
                ActionsEnabled=True,
                OKActions=["arn:aws:sns:us-east-1:820700710557:ops-alarm"],
                AlarmActions=["arn:aws:sns:us-east-1:820700710557:ops-alarm"],
                MetricName='disk_used_percent',
                Namespace='CWAgent',
                Statistic='Average',
                Dimensions=dimensions,
                Period=300,
                EvaluationPeriods=1,
                DatapointsToAlarm=1,
                Threshold=85.0,
                ComparisonOperator='GreaterThanThreshold',
                TreatMissingData='notBreaching',
            )
        except cloudwatch.exceptions.ClientError as error:
            # boto3 raises on API errors instead of returning a non-200
            # response; report the failure and move on to the next host.
            print(f"Failed to create disk utilization alarm for {host}.")
            print(f"  {error}")
            continue

        # Check if alarm creation was successful
        if response['ResponseMetadata']['HTTPStatusCode'] == 200:
            print(f"Disk utilization alarm for {host} created successfully.")
        else:
            print(f"Failed to create disk utilization alarm for {host}.")


# Create the alarms
create_disk_alarm()
监控内存利用率
import boto3
# Create the CloudWatch client
cloudwatch = boto3.client('cloudwatch')
# Host names to create memory alarms for; each is used as the "host" dimension value
host_name_list = ['ip-172-31-71-29.ec2.internal', 'ip-172-31-71-30.ec2.internal', 'ip-172-31-71-31.ec2.internal']
# Define the function that creates the CloudWatch alarms
def create_memory_alarms():
    """Create one memory-utilization alarm per host in ``host_name_list``.

    Each alarm watches the ``mem_used_percent`` metric in the ``CWAgent``
    namespace, filtered by the ``host`` dimension, and goes into ALARM when
    the 300-second average exceeds 85. The alarm is named ``ec2_<host>_mem``.

    The outcome for each host is printed. An API error for one host is
    reported and does not stop alarm creation for the remaining hosts.
    """
    for host_name in host_name_list:
        try:
            response = cloudwatch.put_metric_alarm(
                AlarmName=f'ec2_{host_name}_mem',
                ActionsEnabled=True,
                OKActions=["arn:aws:sns:us-east-1:820700710557:ops-alarm"],
                AlarmActions=["arn:aws:sns:us-east-1:820700710557:ops-alarm"],
                InsufficientDataActions=[],
                MetricName='mem_used_percent',
                Namespace='CWAgent',
                Statistic='Average',
                Dimensions=[
                    {
                        'Name': 'host',
                        'Value': host_name,
                    },
                ],
                Period=300,
                EvaluationPeriods=1,
                DatapointsToAlarm=1,
                Threshold=85.0,
                ComparisonOperator='GreaterThanThreshold',
                TreatMissingData='notBreaching',
            )
        except cloudwatch.exceptions.ClientError as error:
            # boto3 raises on API errors instead of returning a non-200
            # response; report the failure and move on to the next host.
            print(f"无法创建 {host_name} 内存利用率告警。")
            print(f"  {error}")
            continue

        # Check whether the alarm was created successfully
        if response['ResponseMetadata']['HTTPStatusCode'] == 200:
            print(f"{host_name} 内存利用率告警创建成功。")
        else:
            print(f"无法创建 {host_name} 内存利用率告警。")


# Create the alarms
create_memory_alarms()