Crontab jobs are hard to monitor in day-to-day operations, so we decided to switch to a proper scheduling framework.

1. Install dependencies
# So that connection passwords are not stored in plain text
pip3 install cryptography
pip3 install paramiko

# Fixes: AttributeError: module 'enum' has no attribute 'IntFlag'
pip3 uninstall enum34

pip3 install celery
pip3 install redis
pip3 install dask
yum install mysql-devel
pip3 install mysqlclient
pip3 install apache-airflow
# Reduce log noise
cd /usr/local/lib/python3.7/site-packages/airflow
vim settings.py
# LOGGING_LEVEL = logging.INFO
LOGGING_LEVEL = logging.WARN
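
# Before going further, a quick sanity check that the install is importable
# (prints the installed version):
python3 -c "import airflow; print(airflow.__version__)"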
2. Configure environment variables
# vim /etc/profile
# Point airflow at a working directory; by default AIRFLOW_HOME falls under the current user's home directory
export AIRFLOW_HOME=/usr/local/airflow
 
# source /etc/profile
3. Initialize airflow for the first time
airflow initdb

# Inspect the generated files
cd /usr/local/airflow
ls
airflow.cfg  airflow.db  logs  unittests.cfg
4. Configure the MySQL database (create an airflow database and grant the airflow user access to it)
CREATE DATABASE airflow;

# Note: if you pick a password other than 'airflow_123', update the matching sql_alchemy_conn entry in airflow.cfg below (broker_url and result_backend carry the redis password, which is configured separately)
grant all PRIVILEGES on airflow.* to airflow@'localhost' identified by 'airflow_123';

FLUSH PRIVILEGES;
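
# To confirm the grant took effect, you can inspect it from the same MySQL session:
SHOW GRANTS FOR 'airflow'@'localhost';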
5. Change the MySQL configuration
# Without this setting, initializing the airflow database fails with:
# Exception: Global variable explicit_defaults_for_timestamp needs to be on (1) for mysql

vim /etc/my.cnf
[mysqld]
explicit_defaults_for_timestamp=1

# Restart MySQL
service mysqld restart
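
# After the restart, confirm the variable is actually on before re-running initdb
# (root credentials assumed):
mysql -uroot -p -e "SHOW VARIABLES LIKE 'explicit_defaults_for_timestamp';"
# the Value column should read ON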
6. Modify the airflow configuration
vim airflow/airflow.cfg
# Change the default timezone; otherwise the web UI shows UTC instead of China local time
# default_timezone = utc
default_timezone = Asia/Shanghai


# Change the executor type
executor = CeleryExecutor

# Do not load the example dags (note the key is plural)
load_examples = False

# Do not let the same dag run in parallel. For ETL, linear runs are easier to control; if a step needs batch parallelism, use threads or processes inside the task itself rather than expressing it in the dag
max_active_runs_per_dag = 1

# Maximum number of tasks one dag may run concurrently; usually set to the server's CPU core count (the default of 16 is fine too)
dag_concurrency = 16

# Maximum task concurrency; with CeleryExecutor tasks run inside airflow worker processes, and this sets how many each worker starts
worker_concurrency = 16

# Database connection; we usually run Airflow against MySQL
# Using a pymysql:// URL here previously kept the scheduler from running in the background, hence the plain mysql:// (mysqlclient) driver
sql_alchemy_conn = mysql://airflow:airflow_123@127.0.0.1:3306/airflow?charset=utf8

# My redis password is also airflow_123
# Celery broker; note the default config splits the broker and the result backend across two redis databases
broker_url = redis://:airflow_123@127.0.0.1:6379/0

# Celery result backend; database 1, separate from the broker's database 0
result_backend = redis://:airflow_123@127.0.0.1:6379/1
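
# It is worth verifying that both redis databases accept that password before
# starting anything; a quick check with redis-cli:
redis-cli -a airflow_123 -n 0 ping   # broker db, expect PONG
redis-cli -a airflow_123 -n 1 ping   # result backend db, expect PONG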
7. Patch the displayed time to China time
cd /usr/local/lib/python3.7/site-packages/airflow
cd utils/
vim timezone.py
# Change 1
# Below the line utc = pendulum.timezone('UTC') (line 27), add:
from airflow import configuration as conf
try:
    tz = conf.get("core", "default_timezone")
    if tz == "system":
        utc = pendulum.local_timezone()
    else:
        utc = pendulum.timezone(tz)
except Exception:
    pass

# Change 2: in the same file, modify utcnow() as follows
def utcnow():
    """
    Get the current date and time in UTC
    :return:
    """

    # pendulum utcnow() is not used as that sets a TimezoneInfo object
    # instead of a Timezone. This is not pickable and also creates issues
    # when using replace()
    #d = dt.datetime.utcnow()
    # changed to:
    d = dt.datetime.now()
    d = d.replace(tzinfo=utc)
    return d
    
# Change 3
vim sqlalchemy.py
# Below the line utc = pendulum.timezone('UTC') (line 37), add:
from airflow import configuration as conf
try:
    tz = conf.get("core", "default_timezone")
    if tz == "system":
        utc = pendulum.local_timezone()
    else:
        utc = pendulum.timezone(tz)
except Exception:
    pass
	
# Change 4
# Comment out this line in airflow/utils/sqlalchemy.py (line 124):
# cursor.execute("SET time_zone = '+00:00'")

# Change 5
# In airflow/www/templates/admin/master.html (line 31), change:
# var UTCseconds = (x.getTime() + x.getTimezoneOffset()*60*1000);
var UTCseconds = x.getTime();

# Change 6, in the same file:
# "timeFormat":"H:i:s %UTC%",
"timeFormat":"H:i:s",
8. Re-initialize airflow
airflow initdb
9. Run. Since we use CeleryExecutor, three processes must be started, in this order (-D daemonizes each)
airflow webserver -D
airflow scheduler -D
airflow worker -D
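
With the three daemons up, the simplest end-to-end check is a one-task dag dropped into /usr/local/airflow/dags/ (create the directory if it does not exist). A minimal sketch against the Airflow 1.x API used throughout this post; the file name smoke_test.py and the dag_id are arbitrary:

# /usr/local/airflow/dags/smoke_test.py
from datetime import datetime, timedelta

from airflow import DAG
from airflow.operators.bash_operator import BashOperator

default_args = {
    "owner": "airflow",
    "retries": 1,
    "retry_delay": timedelta(minutes=1),
}

dag = DAG(
    dag_id="smoke_test",
    default_args=default_args,
    start_date=datetime(2019, 1, 1),
    schedule_interval="*/10 * * * *",  # crontab syntax: every 10 minutes
    catchup=False,                     # do not backfill runs since start_date
)

# A single task that just echoes. If it goes green, the scheduler,
# the redis broker and the celery worker are all wired up correctly.
hello = BashOperator(
    task_id="say_hello",
    bash_command="echo hello from airflow",
    dag=dag,
)

Once the scheduler picks the file up, toggle the dag on in the web UI; the say_hello task should succeed within one schedule interval.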
10. Verify


10.1 The scheduler does not appear to be running. Last heartbeat was received 45 seconds ago.
The DAGs list may not update, and new tasks will not be scheduled.

Fix:
The scheduler is not running (or died); check the scheduler logs to see why.
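
Running the scheduler in the foreground usually surfaces the underlying error right away; if it was started with -D, its output goes to files under $AIRFLOW_HOME (these file names are the usual daemon-mode defaults; adjust if yours differ):

airflow scheduler                                # foreground; errors print to the terminal
cat /usr/local/airflow/airflow-scheduler.err     # stderr of the daemonized scheduler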

10.2 AttributeError: module 'enum' has no attribute 'IntFlag'

pip uninstall enum34

10.3 The web UI loads; the installation succeeded.
