1. Basic Introduction
The previous articles finished analyzing the main in-memory data structures of the InnoDB engine. That raises an important question: what are these data structures for, and where are they used? It is really a question of getting from design to application. When studying source code there is a common pattern: people learn only how the code is used, or a few usage tricks, while a slightly more careful reader looks at how the modules are designed and what is worth borrowing from them.
When reading source code, keep asking yourself: why is the code written this way, and what was the design thinking behind it? The author necessarily starts from a design and then writes the code, while the reader works in the opposite direction, starting from the code. Put plainly, the goal of reading source code is to reconstruct, as far as possible, the developer's design ideas and principles, not merely to learn the code's flow and tricks.
Of course, this brings up another issue: most code is strongly tied to its application domain or business, which is one reason developers are so reluctant to take over someone else's code; everyone understands and models the business differently. Well-regarded open-source projects, by contrast, are usually less business-specific, and what domain logic they do contain is generic, such as databases, communication, or image analysis.
The sections below analyze how the data structures covered in the earlier articles are actually used. This makes it easier to trace the design and helps in understanding the ideas behind InnoDB's in-memory data management.
2. Buffer Pool
What matters most in a database is, of course, the data. Its importance shows in two ways: safety and speed. Safety guarantees that the data is trustworthy and usable; speed is what makes large-scale storage, transactions, and the rest practical. What is the Buffer Pool? Simply one means of solving the speed problem. There are many ways to attack speed, but the best known and most widely applied is caching. Caching can be used anywhere it is needed; it is the most common technique computers use to gain speed today, and in practice also the most effective.
In MySQL's overall design, two major parts, the server layer and the storage engine layer, are tightly bound to the database itself (the connection layer and the storage layer have clearly defined roles). The Buffer Pool is the data cache inside the latter. The server layer handles communication with clients and the related parsing and preprocessing, while the storage engine is where the database's real characteristics live. The engine in turn works closely with the storage layer, flushing cached data to disk storage in a timely manner.
Let's look at the relevant code.
First, a Buffer Pool instance has to be created:
static void buf_pool_create(buf_pool_t *buf_pool, ulint buf_pool_size,
ulint instance_no, std::mutex *mutex,
dberr_t &err) {
ulint i;
ulint chunk_size;
buf_chunk_t *chunk;
#ifdef UNIV_LINUX
cpu_set_t cpuset;
CPU_ZERO(&cpuset);
const long n_cores = sysconf(_SC_NPROCESSORS_ONLN);
CPU_SET(instance_no % n_cores, &cpuset);
buf_pool->stat.reset();
if (pthread_setaffinity_np(pthread_self(), sizeof(cpuset), &cpuset) == -1) {
ib::error(ER_IB_ERR_SCHED_SETAFFNINITY_FAILED)
<< "sched_setaffinity() failed!";
}
/* Linux might be able to set different setting for each thread
worth to try to set high priority for this thread. */
setpriority(PRIO_PROCESS, (pid_t)syscall(SYS_gettid), -20);
#endif /* UNIV_LINUX */
ut_ad(buf_pool_size % srv_buf_pool_chunk_unit == 0);
/* 1. Initialize general fields
------------------------------- */
mutex_create(LATCH_ID_BUF_POOL_CHUNKS, &buf_pool->chunks_mutex);
mutex_create(LATCH_ID_BUF_POOL_LRU_LIST, &buf_pool->LRU_list_mutex);
mutex_create(LATCH_ID_BUF_POOL_FREE_LIST, &buf_pool->free_list_mutex);
mutex_create(LATCH_ID_BUF_POOL_ZIP_FREE, &buf_pool->zip_free_mutex);
mutex_create(LATCH_ID_BUF_POOL_ZIP_HASH, &buf_pool->zip_hash_mutex);
mutex_create(LATCH_ID_BUF_POOL_ZIP, &buf_pool->zip_mutex);
mutex_create(LATCH_ID_BUF_POOL_FLUSH_STATE, &buf_pool->flush_state_mutex);
new (&buf_pool->allocator) ut_allocator<unsigned char>(mem_key_buf_buf_pool);
if (buf_pool_size > 0) {
mutex_enter(&buf_pool->chunks_mutex);
buf_pool->n_chunks = buf_pool_size / srv_buf_pool_chunk_unit;
chunk_size = srv_buf_pool_chunk_unit;
buf_pool->chunks = reinterpret_cast<buf_chunk_t *>(
ut_zalloc_nokey(buf_pool->n_chunks * sizeof(*chunk)));
buf_pool->chunks_old = nullptr;
UT_LIST_INIT(buf_pool->LRU, &buf_page_t::LRU);
UT_LIST_INIT(buf_pool->free, &buf_page_t::list);
UT_LIST_INIT(buf_pool->withdraw, &buf_page_t::list);
buf_pool->withdraw_target = 0;
UT_LIST_INIT(buf_pool->flush_list, &buf_page_t::list);
UT_LIST_INIT(buf_pool->unzip_LRU, &buf_block_t::unzip_LRU);
#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
UT_LIST_INIT(buf_pool->zip_clean, &buf_page_t::list);
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
for (i = 0; i < UT_ARR_SIZE(buf_pool->zip_free); ++i) {
UT_LIST_INIT(buf_pool->zip_free[i], &buf_buddy_free_t::list);
}
buf_pool->curr_size = 0;
chunk = buf_pool->chunks;
do {
if (!buf_chunk_init(buf_pool, chunk, chunk_size, mutex)) {
while (--chunk >= buf_pool->chunks) {
buf_block_t *block = chunk->blocks;
for (i = chunk->size; i--; block++) {
mutex_free(&block->mutex);
rw_lock_free(&block->lock);
ut_d(rw_lock_free(&block->debug_latch));
}
buf_pool->deallocate_chunk(chunk);
}
ut_free(buf_pool->chunks);
buf_pool->chunks = nullptr;
err = DB_ERROR;
mutex_exit(&buf_pool->chunks_mutex);
return;
}
buf_pool->curr_size += chunk->size;
} while (++chunk < buf_pool->chunks + buf_pool->n_chunks);
mutex_exit(&buf_pool->chunks_mutex);
buf_pool->instance_no = instance_no;
buf_pool->read_ahead_area = static_cast<page_no_t>(
ut_min(BUF_READ_AHEAD_PAGES,
ut_2_power_up(buf_pool->curr_size / BUF_READ_AHEAD_PORTION)));
buf_pool->curr_pool_size = buf_pool->curr_size * UNIV_PAGE_SIZE;
buf_pool->old_size = buf_pool->curr_size;
buf_pool->n_chunks_new = buf_pool->n_chunks;
/* Number of locks protecting page_hash must be a
power of two */
srv_n_page_hash_locks =
static_cast<ulong>(ut_2_power_up(srv_n_page_hash_locks));
ut_a(srv_n_page_hash_locks != 0);
ut_a(srv_n_page_hash_locks <= MAX_PAGE_HASH_LOCKS);
buf_pool->page_hash =
ib_create(2 * buf_pool->curr_size, LATCH_ID_HASH_TABLE_RW_LOCK,
srv_n_page_hash_locks, MEM_HEAP_FOR_PAGE_HASH);
buf_pool->page_hash_old = nullptr;
buf_pool->zip_hash = hash_create(2 * buf_pool->curr_size);
buf_pool->last_printout_time = ut_time_monotonic();
}
/* 2. Initialize flushing fields
-------------------------------- */
mutex_create(LATCH_ID_FLUSH_LIST, &buf_pool->flush_list_mutex);
for (i = BUF_FLUSH_LRU; i < BUF_FLUSH_N_TYPES; i++) {
buf_pool->no_flush[i] = os_event_create();
}
buf_pool->watch = (buf_page_t *)ut_zalloc_nokey(sizeof(*buf_pool->watch) *
BUF_POOL_WATCH_SIZE);
for (i = 0; i < BUF_POOL_WATCH_SIZE; i++) {
buf_pool->watch[i].buf_pool_index = buf_pool->instance_no;
}
/* All fields are initialized by ut_zalloc_nokey(). */
buf_pool->try_LRU_scan = TRUE;
/* Dirty Page Tracking is disabled by default. */
buf_pool->track_page_lsn = LSN_MAX;
buf_pool->max_lsn_io = 0;
/* Initialize the hazard pointer for flush_list batches */
new (&buf_pool->flush_hp) FlushHp(buf_pool, &buf_pool->flush_list_mutex);
/* Initialize the hazard pointer for the oldest page scan */
new (&buf_pool->oldest_hp) FlushHp(buf_pool, &buf_pool->flush_list_mutex);
/* Initialize the hazard pointer for LRU batches */
new (&buf_pool->lru_hp) LRUHp(buf_pool, &buf_pool->LRU_list_mutex);
/* Initialize the iterator for LRU scan search */
new (&buf_pool->lru_scan_itr) LRUItr(buf_pool, &buf_pool->LRU_list_mutex);
/* Initialize the iterator for single page scan search */
new (&buf_pool->single_scan_itr) LRUItr(buf_pool, &buf_pool->LRU_list_mutex);
err = DB_SUCCESS;
}
dberr_t buf_pool_init(ulint total_size, ulint n_instances) {
ulint i;
const ulint size = total_size / n_instances;
ut_ad(n_instances > 0);
ut_ad(n_instances <= MAX_BUFFER_POOLS);
ut_ad(n_instances == srv_buf_pool_instances);
NUMA_MEMPOLICY_INTERLEAVE_IN_SCOPE;
/* Usually buf_pool_should_madvise is protected by buf_pool_t::chunk_mutex-es,
but at this point in time there is no buf_pool_t instances yet, and no risk of
race condition with sys_var modifications or buffer pool resizing because we
have just started initializing the buffer pool.*/
buf_pool_should_madvise = innobase_should_madvise_buf_pool();
buf_pool_resizing = false;
buf_pool_ptr =
(buf_pool_t *)ut_zalloc_nokey(n_instances * sizeof *buf_pool_ptr);
buf_chunk_map_reg = UT_NEW_NOKEY(buf_pool_chunk_map_t());
std::vector<dberr_t> errs;
errs.assign(n_instances, DB_SUCCESS);
#ifdef UNIV_LINUX
ulint n_cores = sysconf(_SC_NPROCESSORS_ONLN);
/* Magic number 8 is from empirical testing on a
4 socket x 10 Cores x 2 HT host. 128G / 16 instances
takes about 4 secs, compared to 10 secs without this
optimisation.. */
if (n_cores > 8) {
n_cores = 8;
}
#else
ulint n_cores = 4;
#endif /* UNIV_LINUX */
dberr_t err = DB_SUCCESS;
for (i = 0; i < n_instances; /* no op */) {
ulint n = i + n_cores;
if (n > n_instances) {
n = n_instances;
}
std::vector<std::thread> threads;
std::mutex m;
for (ulint id = i; id < n; ++id) {
threads.emplace_back(std::thread(buf_pool_create, &buf_pool_ptr[id], size,
id, &m, std::ref(errs[id])));
}
for (ulint id = i; id < n; ++id) {
threads[id - i].join();
if (errs[id] != DB_SUCCESS) {
err = errs[id];
}
}
if (err != DB_SUCCESS) {
for (size_t id = 0; id < n; ++id) {
if (buf_pool_ptr[id].chunks != nullptr) {
buf_pool_free_instance(&buf_pool_ptr[id]);
}
}
buf_pool_free();
return (err);
}
/* Do the next block of instances */
i = n;
}
buf_pool_set_sizes();
buf_LRU_old_ratio_update(100 * 3 / 8, FALSE);
btr_search_sys_create(buf_pool_get_curr_size() / sizeof(void *) / 64);
buf_stat_per_index =
UT_NEW(buf_stat_per_index_t(), mem_key_buf_stat_per_index_t);
return (DB_SUCCESS);
}
These functions initialize and create the buffer pool instances. The chunk handling is done as follows:
static buf_chunk_t *buf_chunk_init(
buf_pool_t *buf_pool, /*!< in: buffer pool instance */
buf_chunk_t *chunk, /*!< out: chunk of buffers */
ulonglong mem_size, /*!< in: requested size in bytes */
std::mutex *mutex) /*!< in,out: Mutex protecting chunk map. */
{
buf_block_t *block;
byte *frame;
ulint i;
mutex_own(&buf_pool->chunks_mutex);
/* Round down to a multiple of page size,
although it already should be. */
mem_size = ut_2pow_round(mem_size, UNIV_PAGE_SIZE);
/* Reserve space for the block descriptors. */
mem_size += ut_2pow_round(
(mem_size / UNIV_PAGE_SIZE) * (sizeof *block) + (UNIV_PAGE_SIZE - 1),
UNIV_PAGE_SIZE);
DBUG_EXECUTE_IF("ib_buf_chunk_init_fails", return (nullptr););
if (!buf_pool->allocate_chunk(mem_size, chunk)) {
return (nullptr);
}
#ifdef HAVE_LIBNUMA
if (srv_numa_interleave) {
struct bitmask *numa_nodes = numa_get_mems_allowed();
int st = mbind(chunk->mem, chunk->mem_size(), MPOL_INTERLEAVE,
numa_nodes->maskp, numa_nodes->size, MPOL_MF_MOVE);
if (st != 0) {
ib::warn(ER_IB_MSG_54) << "Failed to set NUMA memory policy of"
" buffer pool page frames to MPOL_INTERLEAVE"
" (error: "
<< strerror(errno) << ").";
}
numa_bitmask_free(numa_nodes);
}
#endif /* HAVE_LIBNUMA */
/* Allocate the block descriptors from
the start of the memory block. */
chunk->blocks = (buf_block_t *)chunk->mem;
/* Align a pointer to the first frame. Note that when
os_large_page_size is smaller than UNIV_PAGE_SIZE,
we may allocate one fewer block than requested. When
it is bigger, we may allocate more blocks than requested. */
frame = (byte *)ut_align(chunk->mem, UNIV_PAGE_SIZE);
chunk->size = chunk->mem_pfx.m_size / UNIV_PAGE_SIZE - (frame != chunk->mem);
/* Subtract the space needed for block descriptors. */
{
ulint size = chunk->size;
while (frame < (byte *)(chunk->blocks + size)) {
frame += UNIV_PAGE_SIZE;
size--;
}
chunk->size = size;
}
/* Init block structs and assign frames for them. Then we
assign the frames to the first blocks (we already mapped the
memory above). */
block = chunk->blocks;
for (i = chunk->size; i--;) {
buf_block_init(buf_pool, block, frame);
UNIV_MEM_INVALID(block->frame, UNIV_PAGE_SIZE);
/* Add the block to the free list */
UT_LIST_ADD_LAST(buf_pool->free, &block->page);
ut_d(block->page.in_free_list = TRUE);
ut_ad(!block->page.someone_has_io_responsibility());
ut_ad(buf_pool_from_block(block) == buf_pool);
block++;
frame += UNIV_PAGE_SIZE;
}
if (mutex != nullptr) {
mutex->lock();
}
buf_pool_register_chunk(chunk);
if (mutex != nullptr) {
mutex->unlock();
}
#ifdef PFS_GROUP_BUFFER_SYNC
pfs_register_buffer_block(chunk);
#endif /* PFS_GROUP_BUFFER_SYNC */
return (chunk);
}
bool buf_pool_t::allocate_chunk(ulonglong mem_size, buf_chunk_t *chunk) {
ut_ad(mutex_own(&chunks_mutex));
chunk->mem = allocator.allocate_large(mem_size, &chunk->mem_pfx);
if (chunk->mem == nullptr) {
return false;
}
/* Dump core without large memory buffers */
if (buf_pool_should_madvise) {
if (!chunk->madvise_dont_dump()) {
innobase_disable_core_dump();
}
}
return true;
}
void buf_pool_t::deallocate_chunk(buf_chunk_t *chunk) {
ut_ad(mutex_own(&chunks_mutex));
/* Undo the effect of the earlier MADV_DONTDUMP */
if (buf_pool_should_madvise) {
if (!chunk->madvise_dump()) {
innobase_disable_core_dump();
}
}
allocator.deallocate_large(chunk->mem, &chunk->mem_pfx);
}
As analyzed earlier, the Buffer Pool is allocated in chunks, and these are the basic functions for initializing and creating it. One InnoDB engine has multiple buffer pool instances, each represented by buf_pool_t; an instance contains multiple chunks (buf_chunk_t), a chunk contains multiple blocks (buf_block_t), and each block finally contains a page (buf_page_t). Taken as a whole this forms a tree-like chain of structures. buf0buf.cc also contains the functions for reallocating and resetting the pool, which matter a great deal to the buffer pool and are worth analyzing carefully.
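To make that hierarchy concrete, here is a small self-contained sketch. The types below are simplified stand-ins, not the real InnoDB definitions; they only mirror the ownership chain buf_pool_t → buf_chunk_t → buf_block_t → buf_page_t and walk it in the same order buf_pool_create() initializes blocks:
#include <cstddef>
#include <cstdio>
#include <vector>

struct buf_page_sketch {            // stands in for buf_page_t
  unsigned space_id = 0;
  unsigned page_no = 0;
};

struct buf_block_sketch {           // stands in for buf_block_t
  buf_page_sketch page;             // control information for one page
  unsigned char *frame = nullptr;   // would point at the 16KB page frame
};

struct buf_chunk_sketch {           // stands in for buf_chunk_t
  std::vector<buf_block_sketch> blocks;
};

struct buf_pool_sketch {            // stands in for buf_pool_t (one instance)
  std::vector<buf_chunk_sketch> chunks;
};

// Walk the whole tree: every page descriptor is reached through exactly one
// chunk and one block, which is why freeing a chunk also releases its blocks.
static std::size_t count_pages(const buf_pool_sketch &pool) {
  std::size_t n = 0;
  for (const auto &chunk : pool.chunks) n += chunk.blocks.size();
  return n;
}

int main() {
  buf_pool_sketch pool;
  pool.chunks.resize(2);                                    // two chunks in this instance
  for (auto &chunk : pool.chunks) chunk.blocks.resize(4);   // four blocks per chunk
  std::printf("pages reachable from this instance: %zu\n", count_pages(pool));
  return 0;
}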
Flushing down to disk is handled in the file buf0flu.cc:
static void buf_flush_write_block_low(buf_page_t *bpage, buf_flush_t flush_type,
bool sync) {
page_t *frame = nullptr;
#ifdef UNIV_DEBUG
buf_pool_t *buf_pool = buf_pool_from_bpage(bpage);
ut_ad(!mutex_own(&buf_pool->LRU_list_mutex));
#endif /* UNIV_DEBUG */
DBUG_PRINT("ib_buf", ("flush %s %u page " UINT32PF ":" UINT32PF,
sync ? "sync" : "async", (unsigned)flush_type,
bpage->id.space(), bpage->id.page_no()));
ut_ad(buf_page_in_file(bpage));
/* We are not holding block_mutex here. Nevertheless, it is safe to
access bpage, because it is io_fixed and oldest_modification != 0.
Thus, it cannot be relocated in the buffer pool or removed from
flush_list or LRU_list. */
ut_ad(!buf_flush_list_mutex_own(buf_pool));
ut_ad(!buf_page_get_mutex(bpage)->is_owned());
ut_ad(bpage->is_io_fix_write());
ut_ad(bpage->is_dirty());
#ifdef UNIV_IBUF_COUNT_DEBUG
ut_a(ibuf_count_get(bpage->id) == 0);
#endif /* UNIV_IBUF_COUNT_DEBUG */
ut_ad(recv_recovery_is_on() || bpage->get_newest_lsn() != 0);
/* Force the log to the disk before writing the modified block */
if (!srv_read_only_mode) {
const lsn_t flush_to_lsn = bpage->get_newest_lsn();
/* Do the check before calling log_write_up_to() because in most
cases it would allow to avoid call, and because of that we don't
want those calls because they would have bad impact on the counter
of calls, which is monitored to save CPU on spinning in log threads. */
if (log_sys->flushed_to_disk_lsn.load() < flush_to_lsn) {
Wait_stats wait_stats;
wait_stats = log_write_up_to(*log_sys, flush_to_lsn, true);
MONITOR_INC_WAIT_STATS_EX(MONITOR_ON_LOG_, _PAGE_WRITTEN, wait_stats);
}
}
DBUG_EXECUTE_IF("log_first_rec_group_test", {
recv_no_ibuf_operations = false;
const lsn_t end_lsn = mtr_commit_mlog_test(*log_sys);
log_write_up_to(*log_sys, end_lsn, true);
DBUG_SUICIDE();
});
switch (buf_page_get_state(bpage)) {
case BUF_BLOCK_POOL_WATCH:
case BUF_BLOCK_ZIP_PAGE: /* The page should be dirty. */
case BUF_BLOCK_NOT_USED:
case BUF_BLOCK_READY_FOR_USE:
case BUF_BLOCK_MEMORY:
case BUF_BLOCK_REMOVE_HASH:
ut_error;
break;
case BUF_BLOCK_ZIP_DIRTY: {
frame = bpage->zip.data;
BlockReporter reporter =
BlockReporter(false, frame, bpage->size,
fsp_is_checksum_disabled(bpage->id.space()));
mach_write_to_8(frame + FIL_PAGE_LSN, bpage->get_newest_lsn());
ut_a(reporter.verify_zip_checksum());
break;
}
case BUF_BLOCK_FILE_PAGE:
frame = bpage->zip.data;
if (!frame) {
frame = ((buf_block_t *)bpage)->frame;
}
buf_flush_init_for_writing(
reinterpret_cast<const buf_block_t *>(bpage),
reinterpret_cast<const buf_block_t *>(bpage)->frame,
bpage->zip.data ? &bpage->zip : nullptr, bpage->get_newest_lsn(),
fsp_is_checksum_disabled(bpage->id.space()),
false /* do not skip lsn check */);
break;
}
dberr_t err = dblwr::write(flush_type, bpage, sync);
ut_a(err == DB_SUCCESS || err == DB_TABLESPACE_DELETED);
/* Increment the counter of I/O operations used
for selecting LRU policy. */
buf_LRU_stat_inc_io();
}
/** Writes a flushable page asynchronously from the buffer pool to a file.
NOTE: 1. in simulated aio we must call os_aio_simulated_wake_handler_threads
after we have posted a batch of writes! 2. buf_page_get_mutex(bpage) must be
held upon entering this function. The LRU list mutex must be held if flush_type
== BUF_FLUSH_SINGLE_PAGE. Both mutexes will be released by this function if it
returns true.
@param[in] buf_pool buffer pool instance
@param[in] bpage buffer control block
@param[in] flush_type type of flush
@param[in] sync true if sync IO request
@return true if page was flushed */
ibool buf_flush_page(buf_pool_t *buf_pool, buf_page_t *bpage,
buf_flush_t flush_type, bool sync) {
BPageMutex *block_mutex;
ut_ad(flush_type < BUF_FLUSH_N_TYPES);
/* Hold the LRU list mutex iff called for a single page LRU
flush. A single page LRU flush is already non-performant, and holding
the LRU list mutex allows us to avoid having to store the previous LRU
list page or to restart the LRU scan in
buf_flush_single_page_from_LRU(). */
ut_ad(flush_type == BUF_FLUSH_SINGLE_PAGE ||
!mutex_own(&buf_pool->LRU_list_mutex));
ut_ad(flush_type != BUF_FLUSH_SINGLE_PAGE ||
mutex_own(&buf_pool->LRU_list_mutex));
ut_ad(buf_page_in_file(bpage));
ut_ad(!sync || flush_type == BUF_FLUSH_SINGLE_PAGE);
block_mutex = buf_page_get_mutex(bpage);
ut_ad(mutex_own(block_mutex));
ut_ad(buf_flush_ready_for_flush(bpage, flush_type));
bool is_uncompressed;
is_uncompressed = (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE);
ut_ad(is_uncompressed == (block_mutex != &buf_pool->zip_mutex));
ibool flush;
rw_lock_t *rw_lock = nullptr;
bool no_fix_count = bpage->buf_fix_count == 0;
if (!is_uncompressed) {
flush = TRUE;
rw_lock = nullptr;
} else if (!(no_fix_count || flush_type == BUF_FLUSH_LIST) ||
(!no_fix_count &&
srv_shutdown_state.load() < SRV_SHUTDOWN_FLUSH_PHASE &&
fsp_is_system_temporary(bpage->id.space()))) {
/* This is a heuristic, to avoid expensive SX attempts. */
/* For table residing in temporary tablespace sync is done
using IO_FIX and so before scheduling for flush ensure that
page is not fixed. */
flush = FALSE;
} else {
rw_lock = &reinterpret_cast<buf_block_t *>(bpage)->lock;
if (flush_type != BUF_FLUSH_LIST) {
flush = rw_lock_sx_lock_nowait(rw_lock, BUF_IO_WRITE);
} else {
/* Will SX lock later */
flush = TRUE;
}
}
if (flush) {
/* We are committed to flushing by the time we get here */
mutex_enter(&buf_pool->flush_state_mutex);
buf_page_set_io_fix(bpage, BUF_IO_WRITE);
buf_page_set_flush_type(bpage, flush_type);
if (buf_pool->n_flush[flush_type] == 0) {
os_event_reset(buf_pool->no_flush[flush_type]);
}
++buf_pool->n_flush[flush_type];
if (bpage->get_oldest_lsn() > buf_pool->max_lsn_io) {
buf_pool->max_lsn_io = bpage->get_oldest_lsn();
}
if (!fsp_is_system_temporary(bpage->id.space()) &&
buf_pool->track_page_lsn != LSN_MAX) {
auto frame = bpage->zip.data;
if (frame == nullptr) {
frame = ((buf_block_t *)bpage)->frame;
}
lsn_t frame_lsn = mach_read_from_8(frame + FIL_PAGE_LSN);
arch_page_sys->track_page(bpage, buf_pool->track_page_lsn, frame_lsn,
false);
}
mutex_exit(&buf_pool->flush_state_mutex);
mutex_exit(block_mutex);
if (flush_type == BUF_FLUSH_SINGLE_PAGE) {
mutex_exit(&buf_pool->LRU_list_mutex);
}
if (flush_type == BUF_FLUSH_LIST && is_uncompressed &&
!rw_lock_sx_lock_nowait(rw_lock, BUF_IO_WRITE)) {
if (!fsp_is_system_temporary(bpage->id.space()) && dblwr::enabled) {
dblwr::force_flush(flush_type, buf_pool_index(buf_pool));
} else {
buf_flush_sync_datafiles();
}
rw_lock_sx_lock_gen(rw_lock, BUF_IO_WRITE);
}
/* If there is an observer that wants to know if the
asynchronous flushing was sent then notify it.
Note: we set flush observer to a page with x-latch, so we can
guarantee that notify_flush and notify_remove are called in pair
with s-latch on a uncompressed page. */
if (bpage->get_flush_observer() != nullptr) {
bpage->get_flush_observer()->notify_flush(buf_pool, bpage);
}
/* Even though bpage is not protected by any mutex at this
point, it is safe to access bpage, because it is io_fixed and
oldest_modification != 0. Thus, it cannot be relocated in the
buffer pool or removed from flush_list or LRU_list. */
buf_flush_write_block_low(bpage, flush_type, sync);
}
return (flush);
}
This function also calls log_write_up_to() and several other functions to carry out the actual flush.
Only the code of the overall flow is analyzed here; the related data structures were all covered earlier. The following sections take the same approach.
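The ordering rule enforced near the top of buf_flush_write_block_low() is the write-ahead-logging rule: the redo log must be flushed at least up to the page's newest modification LSN before the page itself may be written. Below is a minimal sketch of that rule with illustrative names; it is not the real InnoDB API:
#include <cstdint>
#include <cstdio>

using lsn_t = std::uint64_t;

struct log_sketch {
  lsn_t flushed_to_disk_lsn = 0;
  void write_up_to(lsn_t lsn) {        // stands in for log_write_up_to()
    if (lsn > flushed_to_disk_lsn) flushed_to_disk_lsn = lsn;
  }
};

struct dirty_page_sketch {
  lsn_t newest_modification;           // LSN of the last change to this page
};

static void flush_page(log_sketch &log, const dirty_page_sketch &page) {
  // Same shape as the check in buf_flush_write_block_low(): only call the
  // (relatively expensive) log write when the log is behind this page.
  if (log.flushed_to_disk_lsn < page.newest_modification) {
    log.write_up_to(page.newest_modification);
  }
  // ... now the page frame itself could be handed to the I/O layer ...
  std::printf("page written, log flushed to %llu\n",
              static_cast<unsigned long long>(log.flushed_to_disk_lsn));
}

int main() {
  log_sketch log;
  flush_page(log, dirty_page_sketch{42});  // forces the log up to LSN 42 first
  return 0;
}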
3. Change Buffer
The Change Buffer is really the same kind of thing as the Buffer Pool above; it is simply a unit carved out of the Buffer Pool specifically for modifications to data, and it concentrates on reducing physical I/O. Let's look at the code that uses it.
The entry point for buffering a change is in ibuf0ibuf.cc:
ibool ibuf_insert(ibuf_op_t op, const dtuple_t *entry, dict_index_t *index,
const page_id_t &page_id, const page_size_t &page_size,
que_thr_t *thr) {
dberr_t err;
ulint entry_size;
ibool no_counter;
/* Read the settable global variable ibuf_use only once in
this function, so that we will have a consistent view of it. */
assert(innodb_change_buffering <= IBUF_USE_ALL);
ibuf_use_t use = static_cast<ibuf_use_t>(innodb_change_buffering);
DBUG_TRACE;
DBUG_PRINT("ibuf", ("op: %d, space: " UINT32PF ", page_no: " UINT32PF, op,
page_id.space(), page_id.page_no()));
ut_ad(dtuple_check_typed(entry));
ut_ad(!fsp_is_system_temporary(page_id.space()));
ut_a(!index->is_clustered());
no_counter = use <= IBUF_USE_INSERT;
switch (op) {
case IBUF_OP_INSERT:
switch (use) {
case IBUF_USE_NONE:
case IBUF_USE_DELETE:
case IBUF_USE_DELETE_MARK:
return FALSE;
case IBUF_USE_INSERT:
case IBUF_USE_INSERT_DELETE_MARK:
case IBUF_USE_ALL:
goto check_watch;
}
break;
case IBUF_OP_DELETE_MARK:
switch (use) {
case IBUF_USE_NONE:
case IBUF_USE_INSERT:
return FALSE;
case IBUF_USE_DELETE_MARK:
case IBUF_USE_DELETE:
case IBUF_USE_INSERT_DELETE_MARK:
case IBUF_USE_ALL:
ut_ad(!no_counter);
goto check_watch;
}
break;
case IBUF_OP_DELETE:
switch (use) {
case IBUF_USE_NONE:
case IBUF_USE_INSERT:
case IBUF_USE_INSERT_DELETE_MARK:
return FALSE;
case IBUF_USE_DELETE_MARK:
case IBUF_USE_DELETE:
case IBUF_USE_ALL:
ut_ad(!no_counter);
goto skip_watch;
}
break;
case IBUF_OP_COUNT:
break;
}
/* unknown op or use */
ut_error;
check_watch:
/* If a thread attempts to buffer an insert on a page while a
purge is in progress on the same page, the purge must not be
buffered, because it could remove a record that was
re-inserted later. For simplicity, we block the buffering of
all operations on a page that has a purge pending.
We do not check this in the IBUF_OP_DELETE case, because that
would always trigger the buffer pool watch during purge and
thus prevent the buffering of delete operations. We assume
that the issuer of IBUF_OP_DELETE has called
buf_pool_watch_set(space, page_no). */
{
buf_pool_t *buf_pool = buf_pool_get(page_id);
buf_page_t *bpage = buf_page_get_also_watch(buf_pool, page_id);
if (bpage != nullptr) {
/* A buffer pool watch has been set or the
page has been read into the buffer pool.
Do not buffer the request. If a purge operation
is being buffered, have this request executed
directly on the page in the buffer pool after the
buffered entries for this page have been merged. */
return FALSE;
}
}
skip_watch:
entry_size = rec_get_converted_size(index, entry);
if (entry_size >=
page_get_free_space_of_empty(dict_table_is_comp(index->table)) / 2) {
return FALSE;
}
err = ibuf_insert_low(BTR_MODIFY_PREV, op, no_counter, entry, entry_size,
index, page_id, page_size, thr);
if (err == DB_FAIL) {
err =
ibuf_insert_low(BTR_MODIFY_TREE | BTR_LATCH_FOR_INSERT, op, no_counter,
entry, entry_size, index, page_id, page_size, thr);
}
if (err == DB_SUCCESS) {
/*
#if defined(UNIV_IBUF_DEBUG)
fprintf(stderr, "Ibuf insert for page no %lu of index %s\n",
page_no, index->name);
#endif
*/
return TRUE;
} else {
ut_a(err == DB_STRONG_FAIL || err == DB_TOO_BIG_RECORD);
return FALSE;
}
}
The ibuf_insert function creates an ibuf entry. It calls ibuf_insert_low to do the actual work:
static MY_ATTRIBUTE((warn_unused_result)) dberr_t
ibuf_insert_low(ulint mode, ibuf_op_t op, ibool no_counter,
const dtuple_t *entry, ulint entry_size,
dict_index_t *index, const page_id_t &page_id,
const page_size_t &page_size, que_thr_t *thr) {
big_rec_t *dummy_big_rec;
btr_pcur_t pcur;
btr_cur_t *cursor;
dtuple_t *ibuf_entry;
mem_heap_t *offsets_heap = nullptr;
mem_heap_t *heap;
ulint *offsets = nullptr;
ulint buffered;
lint min_n_recs;
rec_t *ins_rec;
ibool old_bit_value;
page_t *bitmap_page;
buf_block_t *block;
page_t *root;
dberr_t err;
ibool do_merge;
space_id_t space_ids[IBUF_MAX_N_PAGES_MERGED];
page_no_t page_nos[IBUF_MAX_N_PAGES_MERGED];
ulint n_stored = 0;
mtr_t mtr;
mtr_t bitmap_mtr;
ut_a(!index->is_clustered());
ut_ad(!dict_index_is_spatial(index));
ut_ad(dtuple_check_typed(entry));
ut_ad(!no_counter || op == IBUF_OP_INSERT);
ut_a(op < IBUF_OP_COUNT);
do_merge = FALSE;
/* Perform dirty reads of ibuf->size and ibuf->max_size, to
reduce ibuf_mutex contention. Given that ibuf->max_size and
ibuf->size fit in a machine word, this should be OK; at worst
we are doing some excessive ibuf_contract() or occasionally
skipping an ibuf_contract(). */
if (ibuf->max_size == 0) {
return (DB_STRONG_FAIL);
}
if (ibuf->size >= ibuf->max_size + IBUF_CONTRACT_DO_NOT_INSERT) {
/* Insert buffer is now too big, contract it but do not try
to insert */
#ifdef UNIV_IBUF_DEBUG
fputs("Ibuf too big\n", stderr);
#endif
ibuf_contract(true);
return (DB_STRONG_FAIL);
}
heap = mem_heap_create(1024);
/* Build the entry which contains the space id and the page number
as the first fields and the type information for other fields, and
which will be inserted to the insert buffer. Using a counter value
of 0xFFFF we find the last record for (space, page_no), from which
we can then read the counter value N and use N + 1 in the record we
insert. (We patch the ibuf_entry's counter field to the correct
value just before actually inserting the entry.) */
ibuf_entry =
ibuf_entry_build(op, index, entry, page_id.space(), page_id.page_no(),
no_counter ? ULINT_UNDEFINED : 0xFFFF, heap);
/* Open a cursor to the insert buffer tree to calculate if we can add
the new entry to it without exceeding the free space limit for the
page. */
if (BTR_LATCH_MODE_WITHOUT_INTENTION(mode) == BTR_MODIFY_TREE) {
for (;;) {
mutex_enter(&ibuf_pessimistic_insert_mutex);
mutex_enter(&ibuf_mutex);
if (UNIV_LIKELY(ibuf_data_enough_free_for_insert())) {
break;
}
mutex_exit(&ibuf_mutex);
mutex_exit(&ibuf_pessimistic_insert_mutex);
if (!ibuf_add_free_page()) {
mem_heap_free(heap);
return (DB_STRONG_FAIL);
}
}
}
ibuf_mtr_start(&mtr);
btr_pcur_open(ibuf->index, ibuf_entry, PAGE_CUR_LE, mode, &pcur, &mtr);
ut_ad(page_validate(btr_pcur_get_page(&pcur), ibuf->index));
/* Find out the volume of already buffered inserts for the same index
page */
min_n_recs = 0;
buffered = ibuf_get_volume_buffered(
&pcur, page_id.space(), page_id.page_no(),
op == IBUF_OP_DELETE ? &min_n_recs : nullptr, &mtr);
if (op == IBUF_OP_DELETE &&
(min_n_recs < 2 || buf_pool_watch_occurred(page_id))) {
/* The page could become empty after the record is
deleted, or the page has been read in to the buffer
pool. Refuse to buffer the operation. */
/* The buffer pool watch is needed for IBUF_OP_DELETE
because of latching order considerations. We can
check buf_pool_watch_occurred() only after latching
the insert buffer B-tree pages that contain buffered
changes for the page. We never buffer IBUF_OP_DELETE,
unless some IBUF_OP_INSERT or IBUF_OP_DELETE_MARK have
been previously buffered for the page. Because there
are buffered operations for the page, the insert
buffer B-tree page latches held by mtr will guarantee
that no changes for the user page will be merged
before mtr_commit(&mtr). We must not mtr_commit(&mtr)
until after the IBUF_OP_DELETE has been buffered. */
fail_exit:
if (BTR_LATCH_MODE_WITHOUT_INTENTION(mode) == BTR_MODIFY_TREE) {
mutex_exit(&ibuf_mutex);
mutex_exit(&ibuf_pessimistic_insert_mutex);
}
err = DB_STRONG_FAIL;
goto func_exit;
}
/* After this point, the page could still be loaded to the
buffer pool, but we do not have to care about it, since we are
holding a latch on the insert buffer leaf page that contains
buffered changes for (space, page_no). If the page enters the
buffer pool, buf_page_io_complete() for (space, page_no) will
have to acquire a latch on the same insert buffer leaf page,
which it cannot do until we have buffered the IBUF_OP_DELETE
and done mtr_commit(&mtr) to release the latch. */
#ifdef UNIV_IBUF_COUNT_DEBUG
ut_a((buffered == 0) || ibuf_count_get(page_id));
#endif
ibuf_mtr_start(&bitmap_mtr);
bitmap_page = ibuf_bitmap_get_map_page(page_id, page_size, &bitmap_mtr);
/* We check if the index page is suitable for buffered entries */
if (buf_page_peek(page_id) || lock_rec_expl_exist_on_page(page_id)) {
ibuf_mtr_commit(&bitmap_mtr);
goto fail_exit;
}
if (op == IBUF_OP_INSERT) {
ulint bits = ibuf_bitmap_page_get_bits(bitmap_page, page_id, page_size,
IBUF_BITMAP_FREE, &bitmap_mtr);
if (buffered + entry_size + page_dir_calc_reserved_space(1) >
ibuf_index_page_calc_free_from_bits(page_size, bits)) {
/* Release the bitmap page latch early. */
ibuf_mtr_commit(&bitmap_mtr);
/* It may not fit */
do_merge = TRUE;
ibuf_get_merge_page_nos(FALSE, btr_pcur_get_rec(&pcur), &mtr, space_ids,
page_nos, &n_stored);
goto fail_exit;
}
}
if (!no_counter) {
/* Patch correct counter value to the entry to
insert. This can change the insert position, which can
result in the need to abort in some cases. */
ulint counter = ibuf_get_entry_counter(
page_id.space(), page_id.page_no(), btr_pcur_get_rec(&pcur), &mtr,
btr_pcur_get_btr_cur(&pcur)->low_match < IBUF_REC_FIELD_METADATA);
dfield_t *field;
if (counter == ULINT_UNDEFINED) {
ibuf_mtr_commit(&bitmap_mtr);
goto fail_exit;
}
field = dtuple_get_nth_field(ibuf_entry, IBUF_REC_FIELD_METADATA);
mach_write_to_2((byte *)dfield_get_data(field) + IBUF_REC_OFFSET_COUNTER,
counter);
}
/* Set the bitmap bit denoting that the insert buffer contains
buffered entries for this index page, if the bit is not set yet */
old_bit_value = ibuf_bitmap_page_get_bits(bitmap_page, page_id, page_size,
IBUF_BITMAP_BUFFERED, &bitmap_mtr);
if (!old_bit_value) {
ibuf_bitmap_page_set_bits(bitmap_page, page_id, page_size,
IBUF_BITMAP_BUFFERED, TRUE, &bitmap_mtr);
}
ibuf_mtr_commit(&bitmap_mtr);
cursor = btr_pcur_get_btr_cur(&pcur);
if (mode == BTR_MODIFY_PREV) {
err = btr_cur_optimistic_insert(BTR_NO_LOCKING_FLAG, cursor, &offsets,
&offsets_heap, ibuf_entry, &ins_rec,
&dummy_big_rec, thr, &mtr);
block = btr_cur_get_block(cursor);
ut_ad(block->page.id.space() == IBUF_SPACE_ID);
/* If this is the root page, update ibuf->empty. */
if (block->page.id.page_no() == FSP_IBUF_TREE_ROOT_PAGE_NO) {
const page_t *root = buf_block_get_frame(block);
ut_ad(page_get_space_id(root) == IBUF_SPACE_ID);
ut_ad(page_get_page_no(root) == FSP_IBUF_TREE_ROOT_PAGE_NO);
ibuf->empty = page_is_empty(root);
}
} else {
ut_ad(BTR_LATCH_MODE_WITHOUT_INTENTION(mode) == BTR_MODIFY_TREE);
/* We acquire an sx-latch to the root page before the insert,
because a pessimistic insert releases the tree x-latch,
which would cause the sx-latching of the root after that to
break the latching order. */
root = ibuf_tree_root_get(&mtr);
err = btr_cur_optimistic_insert(BTR_NO_LOCKING_FLAG | BTR_NO_UNDO_LOG_FLAG,
cursor, &offsets, &offsets_heap, ibuf_entry,
&ins_rec, &dummy_big_rec, thr, &mtr);
if (err == DB_FAIL) {
err = btr_cur_pessimistic_insert(
BTR_NO_LOCKING_FLAG | BTR_NO_UNDO_LOG_FLAG, cursor, &offsets,
&offsets_heap, ibuf_entry, &ins_rec, &dummy_big_rec, thr, &mtr);
}
mutex_exit(&ibuf_pessimistic_insert_mutex);
ibuf_size_update(root);
mutex_exit(&ibuf_mutex);
ibuf->empty = page_is_empty(root);
block = btr_cur_get_block(cursor);
ut_ad(block->page.id.space() == IBUF_SPACE_ID);
}
if (offsets_heap) {
mem_heap_free(offsets_heap);
}
if (err == DB_SUCCESS && op != IBUF_OP_DELETE) {
/* Update the page max trx id field */
page_update_max_trx_id(block, nullptr, thr_get_trx(thr)->id, &mtr);
}
func_exit:
#ifdef UNIV_IBUF_COUNT_DEBUG
if (err == DB_SUCCESS) {
ib::info(ER_IB_MSG_607)
<< "Incrementing ibuf count of page " << page_id << " from "
<< ibuf_count_get(space, page_no) << " by 1";
ibuf_count_set(page_id, ibuf_count_get(page_id) + 1);
}
#endif
ibuf_mtr_commit(&mtr);
btr_pcur_close(&pcur);
mem_heap_free(heap);
if (err == DB_SUCCESS &&
BTR_LATCH_MODE_WITHOUT_INTENTION(mode) == BTR_MODIFY_TREE) {
ibuf_contract_after_insert(entry_size);
}
if (do_merge) {
#ifdef UNIV_IBUF_DEBUG
ut_a(n_stored <= IBUF_MAX_N_PAGES_MERGED);
#endif
buf_read_ibuf_merge_pages(false, space_ids, page_nos, n_stored);
}
return (err);
}
The code lives mainly in innobase/ibuf/ibuf0ibuf.cc; a more detailed reading can be carried out in that file.
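Stripped of the B-tree and bitmap handling, the early part of ibuf_insert() is a series of gates that decide whether a change may be buffered at all. The sketch below condenses that decision with simplified stand-in fields; the real function also consults innodb_change_buffering and the operation type, which are omitted here:
#include <cstddef>
#include <cstdio>

struct ibuf_request_sketch {
  bool index_is_clustered;      // change buffering is for secondary indexes only
  bool page_in_buffer_pool;     // buf_page_get_also_watch() found the page (or a watch)
  std::size_t entry_size;       // rec_get_converted_size(index, entry)
  std::size_t empty_page_free;  // page_get_free_space_of_empty(...)
};

// Returns true when the change may be buffered instead of applied to the page
// directly; this mirrors the early-return structure of ibuf_insert().
static bool may_buffer(const ibuf_request_sketch &r) {
  if (r.index_is_clustered) return false;                   // ut_a(!index->is_clustered())
  if (r.page_in_buffer_pool) return false;                  // page is cached: change it directly
  if (r.entry_size >= r.empty_page_free / 2) return false;  // entry too large to buffer
  return true;                                              // try ibuf_insert_low(BTR_MODIFY_PREV, ...)
}

int main() {
  ibuf_request_sketch r{false, false, 120, 16000};          // made-up sizes for illustration
  std::printf("may buffer: %d\n", static_cast<int>(may_buffer(r)));  // prints 1
  return 0;
}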
4. Adaptive Hash Index
The adaptive hash index also exists to speed up lookups; as analyzed earlier, it simply comes with some preconditions. That is reasonable: memory is precious and cannot be spent everywhere on a whim, so some caution is in order.
In innobase/include/btr0sea.ic you can see the function that obtains this hash table:
hash_table_t *btr_get_search_table(const dict_index_t *index) {
ut_ad(index != nullptr);
ulint ifold = ut_fold_ulint_pair(static_cast<ulint>(index->id),
static_cast<ulint>(index->space));
return (btr_search_sys->hash_tables[ifold % btr_ahi_parts]);
}
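So each index is pinned to one of the btr_ahi_parts partitions by folding its index id and space id. The tiny illustration below shows that selection; the mixing function is a generic stand-in rather than the real ut_fold_ulint_pair(), and the ids are made up:
#include <cstdint>
#include <cstdio>

// Generic 64-bit mix of two values; any reasonable fold works for the picture.
static std::uint64_t fold_pair(std::uint64_t a, std::uint64_t b) {
  return (a * 0x9E3779B97F4A7C15ULL) ^ (b + (a << 6) + (a >> 2));
}

int main() {
  const unsigned btr_ahi_parts = 8;   // innodb_adaptive_hash_index_parts
  const std::uint64_t index_id = 57;  // hypothetical index->id
  const std::uint64_t space_id = 3;   // hypothetical index->space
  const unsigned part =
      static_cast<unsigned>(fold_pair(index_id, space_id) % btr_ahi_parts);
  std::printf("index %llu in space %llu uses AHI partition %u\n",
              static_cast<unsigned long long>(index_id),
              static_cast<unsigned long long>(space_id), part);
  return 0;
}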
The related creation functions are in innobase/btr/btr0sea.cc:
/** Creates and initializes the adaptive search system at a database start.
@param[in] hash_size hash table size. */
void btr_search_sys_create(ulint hash_size) {
/* Search System is divided into n parts.
Each part controls access to distinct set of hash buckets from
hash table through its own latch. */
/* Step-1: Allocate latches (1 per part). */
btr_search_latches = reinterpret_cast<rw_lock_t **>(
ut_malloc(sizeof(rw_lock_t *) * btr_ahi_parts, mem_key_ahi));
for (ulint i = 0; i < btr_ahi_parts; ++i) {
btr_search_latches[i] = reinterpret_cast<rw_lock_t *>(
ut_malloc(sizeof(rw_lock_t), mem_key_ahi));
rw_lock_create(btr_search_latch_key, btr_search_latches[i],
SYNC_SEARCH_SYS);
}
/* Step-2: Allocate hash tablees. */
btr_search_sys = reinterpret_cast<btr_search_sys_t *>(
ut_malloc(sizeof(btr_search_sys_t), mem_key_ahi));
btr_search_sys->hash_tables = reinterpret_cast<hash_table_t **>(
ut_malloc(sizeof(hash_table_t *) * btr_ahi_parts, mem_key_ahi));
for (ulint i = 0; i < btr_ahi_parts; ++i) {
btr_search_sys->hash_tables[i] =
ib_create((hash_size / btr_ahi_parts), LATCH_ID_HASH_TABLE_MUTEX, 0,
MEM_HEAP_FOR_BTR_SEARCH);
#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
btr_search_sys->hash_tables[i]->adaptive = TRUE;
#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
}
}
/** Resize hash index hash table.
@param[in] hash_size hash index hash table size */
void btr_search_sys_resize(ulint hash_size) {
/* Step-1: Lock all search latches in exclusive mode. */
btr_search_x_lock_all();
if (btr_search_enabled) {
btr_search_x_unlock_all();
ib::error(ER_IB_MSG_45) << "btr_search_sys_resize failed because"
" hash index hash table is not empty.";
ut_ad(0);
return;
}
/* Step-2: Recreate hash tables with new size. */
for (ulint i = 0; i < btr_ahi_parts; ++i) {
mem_heap_free(btr_search_sys->hash_tables[i]->heap);
hash_table_free(btr_search_sys->hash_tables[i]);
btr_search_sys->hash_tables[i] =
ib_create((hash_size / btr_ahi_parts), LATCH_ID_HASH_TABLE_MUTEX, 0,
MEM_HEAP_FOR_BTR_SEARCH);
#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
btr_search_sys->hash_tables[i]->adaptive = TRUE;
#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
}
/* Step-3: Unlock all search latches from exclusive mode. */
btr_search_x_unlock_all();
}
static void btr_search_build_page_hash_index(dict_index_t *index,
buf_block_t *block, ulint n_fields,
ulint n_bytes, ibool left_side) {
hash_table_t *table;
page_t *page;
rec_t *rec;
rec_t *next_rec;
ulint fold;
ulint next_fold;
ulint n_cached;
ulint n_recs;
ulint *folds;
rec_t **recs;
ulint i;
mem_heap_t *heap = nullptr;
ulint offsets_[REC_OFFS_NORMAL_SIZE];
ulint *offsets = offsets_;
if (index->disable_ahi || !btr_search_enabled) {
return;
}
rec_offs_init(offsets_);
ut_ad(index);
ut_ad(block->page.id.space() == index->space);
ut_a(!dict_index_is_ibuf(index));
ut_ad(!rw_lock_own(btr_get_search_latch(index), RW_LOCK_X));
ut_ad(rw_lock_own(&(block->lock), RW_LOCK_S) ||
rw_lock_own(&(block->lock), RW_LOCK_X));
btr_search_s_lock(index);
table = btr_get_search_table(index);
page = buf_block_get_frame(block);
if (block->index &&
((block->curr_n_fields != n_fields) || (block->curr_n_bytes != n_bytes) ||
(block->curr_left_side != left_side))) {
btr_search_s_unlock(index);
btr_search_drop_page_hash_index(block);
} else {
btr_search_s_unlock(index);
}
/* Check that the values for hash index build are sensible */
if (n_fields == 0 && n_bytes == 0) {
return;
}
if (dict_index_get_n_unique_in_tree(index) <
btr_search_get_n_fields(n_fields, n_bytes)) {
return;
}
n_recs = page_get_n_recs(page);
if (n_recs == 0) {
return;
}
/* Calculate and cache fold values and corresponding records into
an array for fast insertion to the hash index */
folds = (ulint *)ut_malloc_nokey(n_recs * sizeof(ulint));
recs = (rec_t **)ut_malloc_nokey(n_recs * sizeof(rec_t *));
n_cached = 0;
ut_a(index->id == btr_page_get_index_id(page));
rec = page_rec_get_next(page_get_infimum_rec(page));
offsets = rec_get_offsets(rec, index, offsets,
btr_search_get_n_fields(n_fields, n_bytes), &heap);
ut_ad(page_rec_is_supremum(rec) ||
n_fields + (n_bytes > 0) == rec_offs_n_fields(offsets));
const ulint index_fold =
btr_search_fold_index_id(block->page.id.space(), index->id);
fold = rec_fold(rec, offsets, n_fields, n_bytes, index_fold, index);
if (left_side) {
folds[n_cached] = fold;
recs[n_cached] = rec;
n_cached++;
}
for (;;) {
next_rec = page_rec_get_next(rec);
if (page_rec_is_supremum(next_rec)) {
if (!left_side) {
folds[n_cached] = fold;
recs[n_cached] = rec;
n_cached++;
}
break;
}
offsets =
rec_get_offsets(next_rec, index, offsets,
btr_search_get_n_fields(n_fields, n_bytes), &heap);
next_fold =
rec_fold(next_rec, offsets, n_fields, n_bytes, index_fold, index);
if (fold != next_fold) {
/* Insert an entry into the hash index */
if (left_side) {
folds[n_cached] = next_fold;
recs[n_cached] = next_rec;
n_cached++;
} else {
folds[n_cached] = fold;
recs[n_cached] = rec;
n_cached++;
}
}
rec = next_rec;
fold = next_fold;
}
btr_search_check_free_space_in_heap(index);
btr_search_x_lock(index);
if (!btr_search_enabled) {
goto exit_func;
}
if (block->index &&
((block->curr_n_fields != n_fields) || (block->curr_n_bytes != n_bytes) ||
(block->curr_left_side != left_side))) {
goto exit_func;
}
/* This counter is decremented every time we drop page
hash index entries and is incremented here. Since we can
rebuild hash index for a page that is already hashed, we
have to take care not to increment the counter in that
case. */
if (!block->index) {
assert_block_ahi_empty(block);
index->search_info->ref_count++;
}
block->n_hash_helps = 0;
block->curr_n_fields = n_fields;
block->curr_n_bytes = n_bytes;
block->curr_left_side = left_side;
block->index = index;
for (i = 0; i < n_cached; i++) {
ha_insert_for_fold(table, folds[i], block, recs[i]);
}
MONITOR_INC(MONITOR_ADAPTIVE_HASH_PAGE_ADDED);
MONITOR_INC_VALUE(MONITOR_ADAPTIVE_HASH_ROW_ADDED, n_cached);
exit_func:
assert_block_ahi_valid(block);
btr_search_x_unlock(index);
ut_free(folds);
ut_free(recs);
if (UNIV_LIKELY_NULL(heap)) {
mem_heap_free(heap);
}
}
That is the basic usage; there are far too many details to cover here, and you will need to find the thread and work through them yourself.
5. Log Buffer
Besides improving speed, the log buffer also helps guarantee data integrity.
First, writing into the buffer:
lsn_t log_buffer_write(log_t &log, const Log_handle &handle, const byte *str,
size_t str_len, lsn_t start_lsn) {
ut_ad(rw_lock_own(log.sn_lock_inst, RW_LOCK_S));
ut_a(log.buf != nullptr);
ut_a(log.buf_size > 0);
ut_a(log.buf_size % OS_FILE_LOG_BLOCK_SIZE == 0);
ut_a(str != nullptr);
ut_a(str_len > 0);
/* We should first resize the log buffer, if str_len is that big. */
ut_a(str_len < log.buf_size_sn.load());
/* The start_lsn points a data byte (not a header of log block). */
ut_a(log_lsn_validate(start_lsn));
/* We neither write with holes, nor overwrite any fragments of data. */
ut_ad(log.write_lsn.load() <= start_lsn);
ut_ad(log_buffer_ready_for_write_lsn(log) <= start_lsn);
/* That's only used in the assertion at the very end. */
const lsn_t end_sn = log_translate_lsn_to_sn(start_lsn) + str_len;
/* A guard used to detect when we should wrap (to avoid overflowing
outside the log buffer). */
byte *buf_end = log.buf + log.buf_size;
/* Pointer to next data byte to set within the log buffer. */
byte *ptr = log.buf + (start_lsn % log.buf_size);
/* Lsn value for the next byte to copy. */
lsn_t lsn = start_lsn;
/* Copy log records to the reserved space in the log buffer.
Decrease number of bytes to copy (str_len) after some are
copied. Proceed until number of bytes to copy reaches zero. */
while (true) {
/* Calculate offset from the beginning of log block. */
const auto offset = lsn % OS_FILE_LOG_BLOCK_SIZE;
ut_a(offset >= LOG_BLOCK_HDR_SIZE);
ut_a(offset < OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE);
/* Calculate how many free data bytes are available
within current log block. */
const auto left = OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE - offset;
ut_a(left > 0);
ut_a(left < OS_FILE_LOG_BLOCK_SIZE);
size_t len, lsn_diff;
if (left > str_len) {
/* There are enough free bytes to finish copying
the remaining part, leaving at least single free
data byte in the log block. */
len = str_len;
lsn_diff = str_len;
} else {
/* We have more to copy than the current log block
has remaining data bytes, or exactly the same.
In both cases, next lsn value will belong to the
next log block. Copy data up to the end of the
current log block and start a next iteration if
there is more to copy. */
len = left;
lsn_diff = left + LOG_BLOCK_TRL_SIZE + LOG_BLOCK_HDR_SIZE;
}
ut_a(len > 0);
ut_a(ptr + len <= buf_end);
LOG_SYNC_POINT("log_buffer_write_before_memcpy");
/* This is the critical memcpy operation, which copies data
from internal mtr's buffer to the shared log buffer. */
std::memcpy(ptr, str, len);
ut_a(len <= str_len);
str_len -= len;
str += len;
lsn += lsn_diff;
ptr += lsn_diff;
ut_a(log_lsn_validate(lsn));
if (ptr >= buf_end) {
/* Wrap - next copy operation will write at the
beginning of the log buffer. */
ptr -= log.buf_size;
}
if (lsn_diff > left) {
/* We have crossed boundaries between consecutive log
blocks. Either we finish in next block, in which case
user will set the proper first_rec_group field after
this function is finished, or we finish even further,
in which case next block should have 0. In both cases,
we reset next block's value to 0 now, and in the first
case, user will simply overwrite it afterwards. */
ut_a((uintptr_t(ptr) % OS_FILE_LOG_BLOCK_SIZE) == LOG_BLOCK_HDR_SIZE);
ut_a((uintptr_t(ptr) & ~uintptr_t(LOG_BLOCK_HDR_SIZE)) %
OS_FILE_LOG_BLOCK_SIZE ==
0);
log_block_set_first_rec_group(
reinterpret_cast<byte *>(uintptr_t(ptr) &
~uintptr_t(LOG_BLOCK_HDR_SIZE)),
0);
if (str_len == 0) {
/* We have finished at the boundary. */
break;
}
} else {
/* Nothing more to copy - we have finished! */
break;
}
}
ut_a(ptr >= log.buf);
ut_a(ptr <= buf_end);
ut_a(buf_end == log.buf + log.buf_size);
ut_a(log_translate_lsn_to_sn(lsn) == end_sn);
return (lsn);
}
void log_buffer_write_completed(log_t &log, const Log_handle &handle,
lsn_t start_lsn, lsn_t end_lsn) {
ut_ad(rw_lock_own(log.sn_lock_inst, RW_LOCK_S));
ut_a(log_lsn_validate(start_lsn));
ut_a(log_lsn_validate(end_lsn));
ut_a(end_lsn > start_lsn);
/* Let M = log.recent_written_size (number of slots).
For any integer k, all lsn values equal to: start_lsn + k*M
correspond to the same slot, and only the smallest of them
may use the slot. At most one of them can fit the range
[log.buf_ready_for_write_lsn..log.buf_ready_for_write_lsn+M).
Any smaller values have already used the slot. Hence, we just
need to wait until start_lsn will fit the mentioned range. */
uint64_t wait_loops = 0;
while (!log.recent_written.has_space(start_lsn)) {
os_event_set(log.writer_event);
++wait_loops;
std::this_thread::sleep_for(std::chrono::microseconds(20));
}
if (unlikely(wait_loops != 0)) {
MONITOR_INC_VALUE(MONITOR_LOG_ON_RECENT_WRITTEN_WAIT_LOOPS, wait_loops);
}
/* Disallow reordering of writes to log buffer after this point.
This is actually redundant, because we use seq_cst inside the
log.recent_written.add_link(). However, we've decided to leave
the separate acq-rel synchronization between user threads and
log writer. Reasons:
1. Not to rely on internals of Link_buf::add_link.
2. Stress that this synchronization is required in
case someone decided to weaken memory ordering
inside Link_buf. */
std::atomic_thread_fence(std::memory_order_release);
LOG_SYNC_POINT("log_buffer_write_completed_before_store");
ut_ad(log.write_lsn.load() <= start_lsn);
ut_ad(log_buffer_ready_for_write_lsn(log) <= start_lsn);
/* Note that end_lsn will not point to just before footer,
because we have already validated that end_lsn is valid. */
log.recent_written.add_link_advance_tail(start_lsn, end_lsn);
/* if someone is waiting for, set the event. (if possible) */
lsn_t ready_lsn = log_buffer_ready_for_write_lsn(log);
if (log.current_ready_waiting_lsn > 0 &&
log.current_ready_waiting_lsn <= ready_lsn &&
!os_event_is_set(log.closer_event) &&
log_closer_mutex_enter_nowait(log) == 0) {
if (log.current_ready_waiting_lsn > 0 &&
log.current_ready_waiting_lsn <= ready_lsn &&
!os_event_is_set(log.closer_event)) {
log.current_ready_waiting_lsn = 0;
os_event_set(log.closer_event);
}
log_closer_mutex_exit(log);
}
}
Then the buffer is written to disk:
void log_writer(log_t *log_ptr) {
ut_a(log_ptr != nullptr);
log_t &log = *log_ptr;
lsn_t ready_lsn = 0;
log_writer_mutex_enter(log);
Log_thread_waiting waiting{log, log.writer_event, srv_log_writer_spin_delay,
srv_log_writer_timeout};
Log_write_to_file_requests_monitor write_to_file_requests_monitor{log};
for (uint64_t step = 0;; ++step) {
bool released = false;
auto stop_condition = [&ready_lsn, &log, &released,
&write_to_file_requests_monitor](bool wait) {
if (released) {
log_writer_mutex_enter(log);
released = false;
}
/* Advance lsn up to which data is ready in log buffer. */
log_advance_ready_for_write_lsn(log);
ready_lsn = log_buffer_ready_for_write_lsn(log);
/* Wait until any of following conditions holds:
1) There is some unwritten data in log buffer
2) We should close threads. */
if (log.write_lsn.load() < ready_lsn || log.should_stop_threads.load()) {
return (true);
}
if (UNIV_UNLIKELY(
log.writer_threads_paused.load(std::memory_order_acquire))) {
return (true);
}
if (wait) {
write_to_file_requests_monitor.update();
log_writer_mutex_exit(log);
released = true;
}
return (false);
};
const auto wait_stats = waiting.wait(stop_condition);
MONITOR_INC_WAIT_STATS(MONITOR_LOG_WRITER_, wait_stats);
if (UNIV_UNLIKELY(
log.writer_threads_paused.load(std::memory_order_acquire) &&
!log.should_stop_threads.load())) {
log_writer_mutex_exit(log);
os_event_wait(log.writer_threads_resume_event);
log_writer_mutex_enter(log);
ready_lsn = log_buffer_ready_for_write_lsn(log);
}
/* Do the actual work. */
if (log.write_lsn.load() < ready_lsn) {
log_writer_write_buffer(log, ready_lsn);
if (step % 1024 == 0) {
write_to_file_requests_monitor.update();
log_writer_mutex_exit(log);
std::this_thread::sleep_for(std::chrono::seconds(0));
log_writer_mutex_enter(log);
}
} else {
if (log.should_stop_threads.load()) {
/* When log threads are stopped, we must first
ensure that all writes to log buffer have been
finished and only then we are allowed to set
the should_stop_threads to true. */
log_advance_ready_for_write_lsn(log);
ready_lsn = log_buffer_ready_for_write_lsn(log);
if (log.write_lsn.load() == ready_lsn) {
break;
}
}
}
}
log_writer_mutex_exit(log);
}
void log_write_notifier(log_t *log_ptr) {
ut_a(log_ptr != nullptr);
log_t &log = *log_ptr;
lsn_t lsn = log.write_lsn.load() + 1;
log_write_notifier_mutex_enter(log);
Log_thread_waiting waiting{log, log.write_notifier_event,
srv_log_write_notifier_spin_delay,
srv_log_write_notifier_timeout};
for (uint64_t step = 0;; ++step) {
if (log.should_stop_threads.load()) {
if (!log_writer_is_active()) {
if (lsn > log.write_lsn.load()) {
ut_a(lsn == log.write_lsn.load() + 1);
break;
}
}
}
if (UNIV_UNLIKELY(
log.writer_threads_paused.load(std::memory_order_acquire))) {
log_write_notifier_mutex_exit(log);
os_event_wait(log.writer_threads_resume_event);
ut_ad(log.write_notifier_resume_lsn.load(std::memory_order_acquire) + 1 >=
lsn);
lsn = log.write_notifier_resume_lsn.load(std::memory_order_acquire) + 1;
/* clears to acknowledge */
log.write_notifier_resume_lsn.store(0, std::memory_order_release);
log_write_notifier_mutex_enter(log);
}
LOG_SYNC_POINT("log_write_notifier_before_check");
bool released = false;
auto stop_condition = [&log, lsn, &released](bool wait) {
LOG_SYNC_POINT("log_write_notifier_after_event_reset");
if (released) {
log_write_notifier_mutex_enter(log);
released = false;
}
LOG_SYNC_POINT("log_write_notifier_before_check");
if (log.write_lsn.load() >= lsn) {
return (true);
}
if (log.should_stop_threads.load()) {
if (!log_writer_is_active()) {
return (true);
}
}
if (UNIV_UNLIKELY(
log.writer_threads_paused.load(std::memory_order_acquire))) {
return (true);
}
if (wait) {
log_write_notifier_mutex_exit(log);
released = true;
}
LOG_SYNC_POINT("log_write_notifier_before_wait");
return (false);
};
const auto wait_stats = waiting.wait(stop_condition);
MONITOR_INC_WAIT_STATS(MONITOR_LOG_WRITE_NOTIFIER_, wait_stats);
LOG_SYNC_POINT("log_write_notifier_before_write_lsn");
const lsn_t write_lsn = log.write_lsn.load();
const lsn_t notified_up_to_lsn =
ut_uint64_align_up(write_lsn, OS_FILE_LOG_BLOCK_SIZE);
while (lsn <= notified_up_to_lsn) {
const auto slot = log_compute_write_event_slot(log, lsn);
lsn += OS_FILE_LOG_BLOCK_SIZE;
LOG_SYNC_POINT("log_write_notifier_before_notify");
os_event_set(log.write_events[slot]);
}
lsn = write_lsn + 1;
if (step % 1024 == 0) {
log_write_notifier_mutex_exit(log);
std::this_thread::sleep_for(std::chrono::seconds(0));
log_write_notifier_mutex_enter(log);
}
}
log_write_notifier_mutex_exit(log);
}
As MySQL reads and writes data it keeps generating redo log; transactions break their work into mtrs (mini-transactions) whose log records eventually land on disk with them. As analyzed earlier, the log buffer uses lock-free log commit to improve efficiency; in other words, since MySQL 8.0 the old serialized log writing has been replaced by concurrent writing from multiple threads. To make that possible, the Link_buf data structure was introduced. When redo log is committed, the corresponding Link_buf slots are updated with atomic operations, and through these atomic variables the engine keeps the LSN sequence of the whole process continuous (that is, free of holes).
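The sketch below illustrates the Link_buf idea in a deliberately simplified form: writers publish finished byte ranges through atomic slots, and a single consumer advances the "ready for write" point only across gap-free ranges. It is an illustration of the concept only, not the real Link_buf implementation:
#include <atomic>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

class link_buf_sketch {
 public:
  link_buf_sketch(std::size_t capacity, std::uint64_t start)
      : slots_(capacity), tail_(start) {
    for (auto &s : slots_) s.store(0, std::memory_order_relaxed);
  }

  // Writer side: announce that bytes [start, end) have been copied in.
  void add_link(std::uint64_t start, std::uint64_t end) {
    slots_[start % slots_.size()].store(end, std::memory_order_release);
  }

  // Consumer side: move the tail forward while consecutive ranges exist.
  std::uint64_t advance_tail() {
    for (;;) {
      auto &slot = slots_[tail_ % slots_.size()];
      const std::uint64_t end = slot.load(std::memory_order_acquire);
      if (end <= tail_) break;   // hole: an earlier writer has not finished yet
      slot.store(0, std::memory_order_relaxed);
      tail_ = end;               // range [old tail, end) is now contiguous
    }
    return tail_;                // plays the role of log_buffer_ready_for_write_lsn()
  }

 private:
  std::vector<std::atomic<std::uint64_t>> slots_;
  std::uint64_t tail_;
};

int main() {
  link_buf_sketch buf(16, 100);
  buf.add_link(112, 120);        // a later mtr finishes its copy first...
  std::printf("tail=%llu\n", (unsigned long long)buf.advance_tail());  // still 100
  buf.add_link(100, 112);        // ...then the earlier range completes
  std::printf("tail=%llu\n", (unsigned long long)buf.advance_tail());  // jumps to 120
  return 0;
}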
6. Summary
Memory for buffering is finite; even with large server fleets, memory today remains a precious resource. So whatever the buffer, it is necessarily limited in size, it is reused through various algorithms, and its important data is persisted to disk by yet other algorithms. Studying these buffers therefore really means studying the design thinking behind their read/write algorithms, their persistence algorithms, and their update and eviction algorithms. Combined with the data structures studied earlier, that is enough to reconstruct the designers' reasoning completely and clearly.
For most people, talk of design thinking and design philosophy sounds like a non-question, simply because most never reach that level. That does not mean you have to be unusually clever to get there. Learning software is rather like learning martial arts in a wuxia novel: keep practicing and your understanding of the moves deepens; add some persistence in reflection and glimpsing the doorway is possible, with real mastery within reach.
It is a process of repeated realization, and what matters is only how long you persist. Unfortunately, today's software development environment is not very friendly to developers: careers are short, which pushes many smart people toward the fast-food approach. Right or wrong, no judgment is offered here; take it simply as a suggestion.