Runner provides an abstraction for task execution; combined with a thread pool it lets dbt execute the nodes of a task in parallel.
BaseRunner class diagram (for reference)
From the class diagram we can see that BaseRunner defines the runner lifecycle: a compile step, an execute step, and before_execute / after_execute hooks around them.
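A simplified sketch of that shape (approximated from core/dbt/task/base.py, not verbatim source; the class name is suffixed with Sketch to mark it as an illustration):

from abc import ABCMeta, abstractmethod


class BaseRunnerSketch(metaclass=ABCMeta):
    # Approximated; skip handling, event tracking and error wrapping are omitted.
    def __init__(self, config, adapter, node, node_index, num_nodes) -> None:
        self.config = config
        self.adapter = adapter
        self.node = node
        self.node_index = node_index
        self.num_nodes = num_nodes

    @abstractmethod
    def compile(self, manifest):
        """Compile the node (each concrete runner decides what that means)."""

    @abstractmethod
    def execute(self, compiled_node, manifest):
        """Do the actual work for this node."""

    def before_execute(self):   # hook, overridden by subclasses
        pass

    def after_execute(self, result):   # hook, overridden by subclasses
        pass

    def run_with_hooks(self, manifest):
        # what the task's thread pool ultimately calls for each node
        self.before_execute()
        result = self.execute(self.compile(manifest), manifest)
        self.after_execute(result)
        return result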
How tasks use it
A task declares which runner implementation to use mainly through a single method it must override:
def get_runner_type(self, node):
    raise NotImplementedError("Not Implemented")
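For instance, a concrete task just returns its runner class, and GraphRunnableTask instantiates one runner per node (a hedged illustration, approximated from the dbt-core task modules; exact signatures vary across versions):

# Illustration only -- approximated, not verbatim dbt-core source.
class RunTask(CompileTask):
    def get_runner_type(self, node):
        return ModelRunner        # `dbt run` executes model nodes through ModelRunner


# GraphRunnableTask.get_runner (simplified) turns that class into a per-node instance:
#   cls = self.get_runner_type(node)
#   runner = cls(self.config, adapter, node, run_count, num_nodes)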
The task classes implemented so far
As you can see, they are essentially all tasks built on subclasses of GraphRunnableTask.
Where the runner is used
As noted above, the core is the GraphRunnableTask class: task execution happens in its run method, so the actual runner handling also lives in run.
Currently, whenever the job queue of nodes to execute is non-empty, those nodes are executed through runners; a condensed view of the call chain is sketched below.
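A rough sketch of the chain, assembled from the snippets that follow (not verbatim source):

# Rough call chain inside GraphRunnableTask (simplified):
#
#   run()
#     -> _runtime_initialize()            # builds self.job_queue from the selected graph
#     -> execute_with_hooks(selected_uids)
#          -> before_run(...)             # e.g. on-run-start hooks
#          -> execute_nodes()             # spins up the thread pool
#               -> run_queue(pool)        # pulls nodes off job_queue, wraps them in runners
#          -> after_run(...)              # e.g. on-run-end hooks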
GraphRunnableTask initialization
self._raise_next_tick: Optional[DbtRuntimeError] = None
self._skipped_children: Dict[str, Optional[RunResult]] = {}
self.job_queue: Optional[GraphQueue] = None
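A hedged sketch of how these fields are used later on (approximated from runnable.py; the class name is hypothetical):

# Approximated, not verbatim source; the class name is just for illustration.
class GraphRunnableTaskFieldsSketch:
    def _raise_set_error(self) -> None:
        # pool callbacks stash an error in _raise_next_tick; the scheduling
        # loop re-raises it on the main thread on its next pass
        if self._raise_next_tick is not None:
            raise self._raise_next_tick

    # _skipped_children maps a node's unique_id to the upstream failure that
    # caused it to be skipped; handle_job_queue checks it before submitting
    # a runner to the pool (see the sketch after run_queue below).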
Job queue scheduling
def run_queue(self, pool):
    """Given a pool, submit jobs from the queue to the pool."""
    if self.job_queue is None:
        raise DbtInternalError("Got to run_queue with no job queue set")

    def callback(result):
        """Note: mark_done, at a minimum, must happen here or dbt will
        deadlock during ephemeral result error handling!
        """
        self._handle_result(result)

        if self.job_queue is None:
            raise DbtInternalError("Got to run_queue callback with no job queue set")
        self.job_queue.mark_done(result.node.unique_id)

    while not self.job_queue.empty():
        self.handle_job_queue(pool, callback)

    # block on completion
    if get_flags().FAIL_FAST:
        # checkout for an errors after task completion in case of
        # fast failure
        while self.job_queue.wait_until_something_was_done():
            self._raise_set_error()
    else:
        # wait until every task will be complete
        self.job_queue.join()

    # if an error got set during join(), raise it.
    self._raise_set_error()

    return
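run_queue hands the per-node work to handle_job_queue, which pops a node from the queue, builds its runner, applies any pending skip, and submits the job to the pool (a simplified sketch approximated from runnable.py; the real _submit also handles single-threaded mode):

# Approximated sketch, not verbatim source.
def handle_job_queue(self, pool, callback):
    node = self.job_queue.get()
    self._raise_set_error()
    runner = self.get_runner(node)                 # built via get_runner_type()
    if runner.node.unique_id in self._skipped_children:
        cause = self._skipped_children.pop(runner.node.unique_id)
        runner.do_skip(cause=cause)                # upstream failed: skip this node
    self._submit(pool, [runner], callback)


def _submit(self, pool, args, callback):
    self._raise_set_error()
    pool.apply_async(self.call_runner, args=args, callback=callback)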
execute_nodes drives the actual execution:
def execute_nodes(self):
    num_threads = self.config.threads
    target_name = self.config.target_name

    # following line can be removed when legacy logger is removed
    with NodeCount(self.num_nodes):
        fire_event(
            ConcurrencyLine(
                num_threads=num_threads, target_name=target_name, node_count=self.num_nodes
            )
        )
    with TextOnly():
        fire_event(Formatting(""))

    pool = ThreadPool(num_threads, self._pool_thread_initializer, [get_invocation_context()])
    try:
        self.run_queue(pool)
    except FailFastError as failure:
        self._cancel_connections(pool)

        executed_node_ids = [r.node.unique_id for r in self.node_results]
        for r in self._flattened_nodes:
            if r.unique_id not in executed_node_ids:
                self.node_results.append(
                    RunResult.from_node(r, RunStatus.Skipped, "Skipping due to fail_fast")
                )

        print_run_result_error(failure.result)
        # ensure information about all nodes is propagated to run results when failing fast
        return self.node_results
    except (KeyboardInterrupt, SystemExit):
        run_result = self.get_result(
            results=self.node_results,
            elapsed_time=time.time() - self.started_at,
            generated_at=datetime.utcnow(),
        )

        if self.args.write_json and hasattr(run_result, "write"):
            run_result.write(self.result_path())

        self._cancel_connections(pool)
        print_run_end_messages(self.node_results, keyboard_interrupt=True)

        raise

    pool.close()
    pool.join()

    return self.node_results
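Each pool job ends up in call_runner, which is little more than a wrapper around the runner's run_with_hooks (approximated; the real method also fires node start/finish events and records timing):

# Approximated sketch, not verbatim source.
def call_runner(self, runner):
    result = runner.run_with_hooks(self.manifest)
    return result   # fed back into run_queue()'s callback via apply_async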
execute_nodes is invoked where hooks are handled as well as where tests are executed; for example, the run method reaches it from its hooks-handling branch via execute_with_hooks:
else:
    with TextOnly():
        fire_event(Formatting(""))
    selected_uids = frozenset(n.unique_id for n in self._flattened_nodes)
    # when multiple nodes are selected, execution (with hooks) goes through the runners
    result = self.execute_with_hooks(selected_uids)
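execute_with_hooks is essentially a thin wrapper that runs the pre-run hooks, calls execute_nodes, runs the post-run hooks, and assembles the final result (a simplified sketch, approximated from runnable.py):

# Approximated sketch of execute_with_hooks, not verbatim source.
def execute_with_hooks(self, selected_uids):
    adapter = get_adapter(self.config)
    self.started_at = time.time()
    try:
        self.before_run(adapter, selected_uids)    # e.g. on-run-start hooks
        res = self.execute_nodes()                 # the runner-driven execution shown above
        self.after_run(adapter, res)               # e.g. on-run-end hooks
    finally:
        adapter.cleanup_connections()
        elapsed = time.time() - self.started_at
    return self.get_result(
        results=self.node_results, elapsed_time=elapsed, generated_at=datetime.utcnow()
    )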
"""Given a pool, submit jobs from the queue to the pool."""
Runner subclasses for reference
Essentially every task has its own runner implementation:
BaseRunner
┣ CloneRunner
┣ CompileRunner
┃ ┣ GenericSqlRunner
┃ ┃ ┣ SqlCompileRunner
┃ ┃ ┗ SqlExecuteRunner
┃ ┣ ModelRunner
┃ ┃ ┣ SeedRunner
┃ ┃ ┗ SnapshotRunner
┃ ┣ ShowRunner
┃ ┗ TestRunner
┣ FreshnessRunner
┗ SavedQueryRunner
CloneRunner
The runner implementation for the clone task.
- What the clone runner provides
The core is the execute method, which implements the actual clone operation:
class CloneRunner(BaseRunner):
    def before_execute(self):
        pass

    def after_execute(self, result):
        pass

    def _build_run_model_result(self, model, context):
        result = context["load_result"]("main")
        if result:
            status = RunStatus.Success
            message = str(result.response)
        else:
            status = RunStatus.Success
            message = "No-op"
        adapter_response = {}
        if result and isinstance(result.response, dbtClassMixin):
            adapter_response = result.response.to_dict(omit_none=True)
        return RunResult(
            node=model,
            status=status,
            timing=[],
            thread_id=threading.current_thread().name,
            execution_time=0,
            message=message,
            adapter_response=adapter_response,
            failures=None,
        )

    def compile(self, manifest):
        # no-op
        return self.node

    def _materialization_relations(self, result: Any, model) -> List[BaseRelation]:
        if isinstance(result, str):
            msg = (
                'The materialization ("{}") did not explicitly return a '
                "list of relations to add to the cache.".format(str(model.get_materialization()))
            )
            raise CompilationError(msg, node=model)

        if isinstance(result, dict):
            return _validate_materialization_relations_dict(result, model)

        msg = (
            "Invalid return value from materialization, expected a dict "
            'with key "relations", got: {}'.format(str(result))
        )
        raise CompilationError(msg, node=model)

    def execute(self, model, manifest):
        context = generate_runtime_model_context(model, self.config, manifest)
        materialization_macro = manifest.find_materialization_macro_by_name(
            self.config.project_name, "clone", self.adapter.type()
        )

        if "config" not in context:
            raise DbtInternalError(
                "Invalid materialization context generated, missing config: {}".format(context)
            )
        context_config = context["config"]

        hook_ctx = self.adapter.pre_model_hook(context_config)
        try:
            result = MacroGenerator(materialization_macro, context)()
        finally:
            self.adapter.post_model_hook(context_config, hook_ctx)

        for relation in self._materialization_relations(result, model):
            self.adapter.cache_added(relation.incorporate(dbt_created=True))

        return self._build_run_model_result(model, context)
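The clone task then plugs this runner in through get_runner_type, just like the other tasks (an illustration approximated from core/dbt/task/clone.py):

# Approximated, not verbatim source.
class CloneTask(GraphRunnableTask):
    def get_runner_type(self, node):
        return CloneRunner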
Summary
dbt's runner layer runs jobs through a thread pool, which parallelizes the execution of a task's nodes; at present essentially every task under GraphRunnableTask has its own runner.
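The pool-plus-callback scheduling itself is plain multiprocessing.dummy. A minimal standalone sketch of the same pattern (names such as fake_runner are hypothetical, not dbt identifiers):

# Minimal standalone sketch of dbt's pool + callback scheduling pattern.
from multiprocessing.dummy import Pool as ThreadPool  # thread-backed pool, as dbt uses


def fake_runner(item: int) -> int:
    # stands in for call_runner(runner) -> RunResult
    return item * item


def main() -> None:
    results = []
    pool = ThreadPool(4)
    for item in range(10):
        # the callback runs once the job finishes, mirroring run_queue()'s
        # callback that records the result and marks the graph node as done
        pool.apply_async(fake_runner, args=(item,), callback=results.append)
    pool.close()
    pool.join()   # block until every submitted job has completed
    print(sorted(results))


if __name__ == "__main__":
    main()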
References
core/dbt/task/base.py
core/dbt/task/runnable.py
core/dbt/graph/queue.py
https://docs.getdbt.com/best-practices/clone-incremental-models