1. FAT表操作
FAT文件系统中,使用FAT表标记哪个cluster被占用,哪个没被占用。
在Linux内核代码中,与FAT表操作对应的是fat_entry,fatent_ops结构和fat_cache_id缓存等。
1.1 fat_entry
fat中的fat entry用于描述fat文件系统的FAT分配表。
/* fat/fatent.c */
struct fat_entry {
int entry; // 代表当前的簇索引
union { // 簇索引表
u8 *ent12_p[2];
__le16 *ent16_p;
__le32 *ent32_p;
} u;
int nr_bhs; // buffer_head数目,可能是1也可能是2,FAT32是1
struct buffer_head *bhs[2]; // FAT表的扇区的buffer_head
struct inode *fat_inode; // 超级块的inode
};
FAT文件系统中对FAT12/16/32,分别实现了一个fatent_operations。
fatent_operations的初始化在fat_ent_access_init函数,大意是根据超级块里的fat_bits判断当前FAT类型,然后将对应的fatent_operations赋值到超级块的fatent_ops中。
void fat_ent_access_init(struct super_block *sb)
{
struct msdos_sb_info *sbi = MSDOS_SB(sb);
mutex_init(&sbi->fat_lock);
if (is_fat32(sbi)) {
sbi->fatent_shift = 2;
sbi->fatent_ops = &fat32_ops;
} else if (is_fat16(sbi)) {
sbi->fatent_shift = 1;
sbi->fatent_ops = &fat16_ops;
} else if (is_fat12(sbi)) {
sbi->fatent_shift = -1;
sbi->fatent_ops = &fat12_ops;
} else {
fat_fs_error(sb, "invalid FAT variant, %u bits", sbi->fat_bits);
}
}
对于FAT32,超级块的fatent_ops会指向fat32_ops;fatent_shift为2,代表一个簇的编号占用4个字节:
static const struct fatent_operations fat32_ops = {
.ent_blocknr = fat_ent_blocknr,
.ent_set_ptr = fat32_ent_set_ptr,
.ent_bread = fat_ent_bread,
.ent_get = fat32_ent_get,
.ent_put = fat32_ent_put,
.ent_next = fat32_ent_next,
};
1.1.1 fat_ent_blocknr
根据entry编号,得到block号和在block里的索引。entry为簇的索引,比如entry 5,对于FAT32它在FAT表的位置为5*4 + <FAT start>。
// bytes = 5 << 2 = 20
// offset = 20 & (512 - 1) = 20
// blocknr = fat_start + (20 >> 9) = fat_start
static void fat_ent_blocknr(struct super_block *sb, int entry,
int *offset, sector_t *blocknr)
{
struct msdos_sb_info *sbi = MSDOS_SB(sb);
int bytes = (entry << sbi->fatent_shift); // FAT32中sbi->fatent_shift为2
WARN_ON(!fat_valid_entry(sbi, entry));
*offset = bytes & (sb->s_blocksize - 1); // block里的索引,s_blocksize是文件系统中数据块大小,以字节单位
*blocknr = sbi->fat_start + (bytes >> sb->s_blocksize_bits); // block号,s_blocksize_bits是上面的size大小占用位数,例如512字节就是9 bits
}
1.1.2 ent_set_ptr
设置fat_entry的u.ent32_p值。
static void fat32_ent_set_ptr(struct fat_entry *fatent, int offset)
{
WARN_ON(offset & (4 - 1));
fatent->u.ent32_p = (__le32 *)(fatent->bhs[0]->b_data + offset);
}
1.1.3 ent_bread
读FAT分配表,读到后保存到fat_entry的bhs,并将其offset开始放到u.ent32_p。
static int fat_ent_bread(struct super_block *sb, struct fat_entry *fatent,
int offset, sector_t blocknr)
{
const struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops;
WARN_ON(blocknr < MSDOS_SB(sb)->fat_start);
fatent->fat_inode = MSDOS_SB(sb)->fat_inode;
fatent->bhs[0] = sb_bread(sb, blocknr); // 注意这里bhs是个数组,FAT32只用里面第一个,所以下面的nr_bhs为1。
if (!fatent->bhs[0]) {
fat_msg(sb, KERN_ERR, "FAT read failed (blocknr %llu)",
(llu)blocknr);
return -EIO;
}
fatent->nr_bhs = 1;
ops->ent_set_ptr(fatent, offset);
return 0;
}
1.1.4 ent_get
得到当前簇号。
static int fat32_ent_get(struct fat_entry *fatent)
{
int next = le32_to_cpu(*fatent->u.ent32_p) & 0x0fffffff;
WARN_ON((unsigned long)fatent->u.ent32_p & (4 - 1));
if (next >= BAD_FAT32)
next = FAT_ENT_EOF;
return next;
}
1.1.5 ent_put
将FAT表中某个簇的位置写成new代表的值,即更新簇链,一般在释放簇的时候会用到。
static void fat32_ent_put(struct fat_entry *fatent, int new)
{
WARN_ON(new & 0xf0000000);
new |= le32_to_cpu(*fatent->u.ent32_p) & ~0x0fffffff;
*fatent->u.ent32_p = cpu_to_le32(new);
mark_buffer_dirty_inode(fatent->bhs[0], fatent->fat_inode);
}
1.1.6 ent_next
得到下一个簇,注意这里是整个FAT表的下一个簇,不是某一个文件的下一个簇。此函数在分配空闲簇、统计空闲簇是用于遍历整个FAT表。
static int fat32_ent_next(struct fat_entry *fatent)
{
const struct buffer_head *bh = fatent->bhs[0];
fatent->entry++;
if (fatent->u.ent32_p < (__le32 *)(bh->b_data + (bh->b_size - 4))) {
fatent->u.ent32_p++;
return 1;
}
fatent->u.ent32_p = NULL;
return 0;
}
1.2 fat_entry的对外接口
1.2.1 fat_get_cluster
fat_get_cluster根据cluster值得到该cluster在文件中以及硬盘中的簇号。这里的参数cluster是在文件中的簇号,fclus是返回的在文件中的簇号,dclus是在磁盘中的簇号。
int fat_get_cluster(struct inode *inode, int cluster, int *fclus, int *dclus)
{
struct super_block *sb = inode->i_sb;
struct msdos_sb_info *sbi = MSDOS_SB(sb);
const int limit = sb->s_maxbytes >> sbi->cluster_bits;
struct fat_entry fatent;
struct fat_cache_id cid;
int nr;
BUG_ON(MSDOS_I(inode)->i_start == 0);
*fclus = 0;
*dclus = MSDOS_I(inode)->i_start;
if (!fat_valid_entry(sbi, *dclus)) {
fat_fs_error_ratelimit(sb,
"%s: invalid start cluster (i_pos %lld, start %08x)",
__func__, MSDOS_I(inode)->i_pos, *dclus);
return -EIO;
}
if (cluster == 0)
return 0;
// 在cache中查找与cluster最近的fclus和dclus
if (fat_cache_lookup(inode, cluster, &cid, fclus, dclus) < 0) {
/*
* dummy, always not contiguous
* This is reinitialized by cache_init(), later.
*/
cache_init(&cid, -1, -1);
}
// 初始化一个fat\_entry
fatent_init(&fatent);
while (*fclus < cluster) {
/* prevent the infinite loop of cluster chain */
if (*fclus > limit) {
fat_fs_error_ratelimit(sb,
"%s: detected the cluster chain loop (i_pos %lld)",
__func__, MSDOS_I(inode)->i_pos);
nr = -EIO;
goto out;
}
nr = fat_ent_read(inode, &fatent, *dclus);
if (nr < 0)
goto out;
else if (nr == FAT_ENT_FREE) {
fat_fs_error_ratelimit(sb,
"%s: invalid cluster chain (i_pos %lld)",
__func__, MSDOS_I(inode)->i_pos);
nr = -EIO;
goto out;
} else if (nr == FAT_ENT_EOF) {
fat_cache_add(inode, &cid);
goto out;
}
(*fclus)++;
*dclus = nr;
if (!cache_contiguous(&cid, *dclus))
cache_init(&cid, *fclus, *dclus);
}
nr = 0;
// 加入缓存
fat_cache_add(inode, &cid);
out:
fatent_brelse(&fatent);
return nr;
}
上面函数中,提到了fat_entry的缓存,其实现代码在fs/fat/cache.c,作用为根据文件中的簇号找到磁盘中的簇号。
另外,上文中调用了fat_ent_read函数,此函数执行一系列fatent_operations中的操作,例如读fat表中某一个block,查找该block中的cluster表索引,从而得到文件簇对应的磁盘簇。注意这里的读block并不一定是真正去磁盘上读,大多数情况下该block的内容会在block层的disk cache中得到。
1.2.2 fat_count_free_clusters
fat_count_free_clusters的作用是得到当前空闲的簇的总和,即磁盘上有多少剩余空间。此函数实现很简单,就是扫描FAT表,统计哪些簇是空闲的。扫描后的结果会保存在超级块的free_clusters中。
int fat_count_free_clusters(struct super_block *sb)
{
struct msdos_sb_info *sbi = MSDOS_SB(sb);
const struct fatent_operations *ops = sbi->fatent_ops;
struct fat_entry fatent;
struct fatent_ra fatent_ra;
int err = 0, free;
lock_fat(sbi);
if (sbi->free_clusters != -1 && sbi->free_clus_valid)
goto out;
free = 0;
fatent_init(&fatent);
fatent_set_entry(&fatent, FAT_START_ENT);
fat_ra_init(sb, &fatent_ra, &fatent, sbi->max_cluster);
while (fatent.entry < sbi->max_cluster) {
/* readahead of fat blocks */
fat_ent_reada(sb, &fatent_ra, &fatent);
err = fat_ent_read_block(sb, &fatent);
if (err)
goto out;
do {
if (ops->ent_get(&fatent) == FAT_ENT_FREE)
free++;
} while (fat_ent_next(sbi, &fatent));
cond_resched();
}
sbi->free_clusters = free;
sbi->free_clus_valid = 1;
mark_fsinfo_dirty(sb);
fatent_brelse(&fatent);
out:
unlock_fat(sbi);
return err;
}
1.2.3 fat_alloc_clusters
为某个文件(inode)分配新簇,这些新簇会链接到当前文件簇链的末尾。
int fat_alloc_clusters(struct inode *inode, int *cluster, int nr_cluster)
{
struct super_block *sb = inode->i_sb;
struct msdos_sb_info *sbi = MSDOS_SB(sb);
const struct fatent_operations *ops = sbi->fatent_ops;
struct fat_entry fatent, prev_ent;
struct buffer_head *bhs[MAX_BUF_PER_PAGE];
int i, count, err, nr_bhs, idx_clus;
BUG_ON(nr_cluster > (MAX_BUF_PER_PAGE / 2)); /* fixed limit */
lock_fat(sbi);
if (sbi->free_clusters != -1 && sbi->free_clus_valid &&
sbi->free_clusters < nr_cluster) {
unlock_fat(sbi);
return -ENOSPC;
}
err = nr_bhs = idx_clus = 0;
count = FAT_START_ENT;
fatent_init(&prev_ent);
fatent_init(&fatent);
fatent_set_entry(&fatent, sbi->prev_free + 1);
while (count < sbi->max_cluster) {
if (fatent.entry >= sbi->max_cluster)
fatent.entry = FAT_START_ENT;
fatent_set_entry(&fatent, fatent.entry);
err = fat_ent_read_block(sb, &fatent);
if (err)
goto out;
/* Find the free entries in a block */
do {
if (ops->ent_get(&fatent) == FAT_ENT_FREE) {
int entry = fatent.entry;
/* make the cluster chain */
ops->ent_put(&fatent, FAT_ENT_EOF);
if (prev_ent.nr_bhs)
ops->ent_put(&prev_ent, entry);
fat_collect_bhs(bhs, &nr_bhs, &fatent);
sbi->prev_free = entry;
if (sbi->free_clusters != -1)
sbi->free_clusters--;
cluster[idx_clus] = entry;
idx_clus++;
if (idx_clus == nr_cluster)
goto out;
/*
* fat_collect_bhs() gets ref-count of bhs,
* so we can still use the prev_ent.
*/
prev_ent = fatent;
}
count++;
if (count == sbi->max_cluster)
break;
} while (fat_ent_next(sbi, &fatent));
}
/* Couldn't allocate the free entries */
sbi->free_clusters = 0;
sbi->free_clus_valid = 1;
err = -ENOSPC;
out:
unlock_fat(sbi);
mark_fsinfo_dirty(sb);
fatent_brelse(&fatent);
if (!err) {
if (inode_needs_sync(inode))
err = fat_sync_bhs(bhs, nr_bhs);
if (!err)
err = fat_mirror_bhs(sb, bhs, nr_bhs);
}
for (i = 0; i < nr_bhs; i++)
brelse(bhs[i]);
if (err && idx_clus)
fat_free_clusters(inode, cluster[0]);
return err;
}
1.2.4 fat_ent_write
更新FAT表,这里的更新包括两部分,一部分为主FAT,另一部分为mirror FAT。
int fat_ent_write(struct inode *inode, struct fat_entry *fatent,
int new, int wait)
{
struct super_block *sb = inode->i_sb;
const struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops;
int err;
ops->ent_put(fatent, new);
if (wait) {
err = fat_sync_bhs(fatent->bhs, fatent->nr_bhs);
if (err)
return err;
}
return fat_mirror_bhs(sb, fatent->bhs, fatent->nr_bhs);
}
1.2.5 fat_free_clusters
释放FAT表某个簇标记,更新包括两部分,一部分为主FAT,另一部分为mirror FAT。
int fat_free_clusters(struct inode *inode, int cluster)
{
struct super_block *sb = inode->i_sb;
struct msdos_sb_info *sbi = MSDOS_SB(sb);
const struct fatent_operations *ops = sbi->fatent_ops;
struct fat_entry fatent;
struct buffer_head *bhs[MAX_BUF_PER_PAGE];
int i, err, nr_bhs;
int first_cl = cluster, dirty_fsinfo = 0;
nr_bhs = 0;
fatent_init(&fatent);
lock_fat(sbi);
do {
cluster = fat_ent_read(inode, &fatent, cluster);
if (cluster < 0) {
err = cluster;
goto error;
} else if (cluster == FAT_ENT_FREE) {
fat_fs_error(sb, "%s: deleting FAT entry beyond EOF",
__func__);
err = -EIO;
goto error;
}
if (sbi->options.discard) {
/*
* Issue discard for the sectors we no longer
* care about, batching contiguous clusters
* into one request
*/
if (cluster != fatent.entry + 1) {
int nr_clus = fatent.entry - first_cl + 1;
sb_issue_discard(sb,
fat_clus_to_blknr(sbi, first_cl),
nr_clus * sbi->sec_per_clus,
GFP_NOFS, 0);
first_cl = cluster;
}
}
ops->ent_put(&fatent, FAT_ENT_FREE);
if (sbi->free_clusters != -1) {
sbi->free_clusters++;
dirty_fsinfo = 1;
}
if (nr_bhs + fatent.nr_bhs > MAX_BUF_PER_PAGE) {
if (sb->s_flags & SB_SYNCHRONOUS) {
err = fat_sync_bhs(bhs, nr_bhs);
if (err)
goto error;
}
err = fat_mirror_bhs(sb, bhs, nr_bhs);
if (err)
goto error;
for (i = 0; i < nr_bhs; i++)
brelse(bhs[i]);
nr_bhs = 0;
}
fat_collect_bhs(bhs, &nr_bhs, &fatent);
} while (cluster != FAT_ENT_EOF);
if (sb->s_flags & SB_SYNCHRONOUS) {
err = fat_sync_bhs(bhs, nr_bhs);
if (err)
goto error;
}
err = fat_mirror_bhs(sb, bhs, nr_bhs);
error:
fatent_brelse(&fatent);
for (i = 0; i < nr_bhs; i++)
brelse(bhs[i]);
unlock_fat(sbi);
if (dirty_fsinfo)
mark_fsinfo_dirty(sb);
return err;
}
EXPORT_SYMBOL_GPL(fat_free_clusters);
2. file_operations
FAT文件系统使用的file_operations结构为fat_file_operations。
const struct file_operations fat_file_operations = {
.llseek = generic_file_llseek,
.read_iter = generic_file_read_iter, // 读操作
.write_iter = generic_file_write_iter, // 写操作
.mmap = generic_file_mmap,
.release = fat_file_release,
.unlocked_ioctl = fat_generic_ioctl,
.compat_ioctl = compat_ptr_ioctl,
.fsync = fat_file_fsync,
.splice_read = generic_file_splice_read,
.splice_write = iter_file_splice_write,
.fallocate = fat_fallocate,
};
先看一下FAT的读写函数的实现。
2.1 读操作
generic_file_read_iter是kernel提供的通用读函数,最终会调用到file_operations的异步读写函数实现。
/**
* generic_file_read_iter - generic filesystem read routine
* @iocb: kernel I/O control block
* @iter: destination for the data read
*
* This is the "read_iter()" routine for all filesystems
* that can use the page cache directly.
*
* The IOCB_NOWAIT flag in iocb->ki_flags indicates that -EAGAIN shall
* be returned when no data can be read without waiting for I/O requests
* to complete; it doesn't prevent readahead.
*
* The IOCB_NOIO flag in iocb->ki_flags indicates that no new I/O
* requests shall be made for the read or for readahead. When no data
* can be read, -EAGAIN shall be returned. When readahead would be
* triggered, a partial, possibly empty read shall be returned.
*
* Return:
* * number of bytes copied, even for partial reads
* * negative error code (or 0 if IOCB_NOIO) if nothing was read
*/
ssize_t
generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
size_t count = iov_iter_count(iter);
ssize_t retval = 0;
if (!count)
goto out; /* skip atime */
if (iocb->ki_flags & IOCB_DIRECT) {
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
loff_t size;
size = i_size_read(inode);
if (iocb->ki_flags & IOCB_NOWAIT) {
if (filemap_range_has_page(mapping, iocb->ki_pos,
iocb->ki_pos + count - 1))
return -EAGAIN;
} else {
retval = filemap_write_and_wait_range(mapping,
iocb->ki_pos,
iocb->ki_pos + count - 1);
if (retval < 0)
goto out;
}
file_accessed(file);
retval = mapping->a_ops->direct_IO(iocb, iter);
if (retval >= 0) {
iocb->ki_pos += retval;
count -= retval;
}
iov_iter_revert(iter, count - iov_iter_count(iter));
/*
* Btrfs can have a short DIO read if we encounter
* compressed extents, so if there was an error, or if
* we've already read everything we wanted to, or if
* there was a short read because we hit EOF, go ahead
* and return. Otherwise fallthrough to buffered io for
* the rest of the read. Buffered reads will not work for
* DAX files, so don't bother trying.
*/
if (retval < 0 || !count || iocb->ki_pos >= size ||
IS_DAX(inode))
goto out;
}
retval = generic_file_buffered_read(iocb, iter, retval);
out:
return retval;
}
EXPORT_SYMBOL(generic_file_read_iter);
2.2 写操作
写操作与读操作类似:同步调用函数由异步调用函数实现。
/**
* __generic_file_write_iter - write data to a file
* @iocb: IO state structure (file, offset, etc.)
* @from: iov_iter with data to write
*
* This function does all the work needed for actually writing data to a
* file. It does all basic checks, removes SUID from the file, updates
* modification times and calls proper subroutines depending on whether we
* do direct IO or a standard buffered write.
*
* It expects i_mutex to be grabbed unless we work on a block device or similar
* object which does not need locking at all.
*
* This function does *not* take care of syncing data in case of O_SYNC write.
* A caller has to handle it. This is mainly due to the fact that we want to
* avoid syncing under i_mutex.
*
* Return:
* * number of bytes written, even for truncated writes
* * negative error code if no data has been written at all
*/
ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct address_space * mapping = file->f_mapping;
struct inode *inode = mapping->host;
ssize_t written = 0;
ssize_t err;
ssize_t status;
/* We can write back this queue in page reclaim */
current->backing_dev_info = inode_to_bdi(inode);
err = file_remove_privs(file);
if (err)
goto out;
err = file_update_time(file);
if (err)
goto out;
if (iocb->ki_flags & IOCB_DIRECT) {
loff_t pos, endbyte;
written = generic_file_direct_write(iocb, from);
/*
* If the write stopped short of completing, fall back to
* buffered writes. Some filesystems do this for writes to
* holes, for example. For DAX files, a buffered write will
* not succeed (even if it did, DAX does not handle dirty
* page-cache pages correctly).
*/
if (written < 0 || !iov_iter_count(from) || IS_DAX(inode))
goto out;
status = generic_perform_write(file, from, pos = iocb->ki_pos);
/*
* If generic_perform_write() returned a synchronous error
* then we want to return the number of bytes which were
* direct-written, or the error code if that was zero. Note
* that this differs from normal direct-io semantics, which
* will return -EFOO even if some bytes were written.
*/
if (unlikely(status < 0)) {
err = status;
goto out;
}
/*
* We need to ensure that the page cache pages are written to
* disk and invalidated to preserve the expected O_DIRECT
* semantics.
*/
endbyte = pos + status - 1;
err = filemap_write_and_wait_range(mapping, pos, endbyte);
if (err == 0) {
iocb->ki_pos = endbyte + 1;
written += status;
invalidate_mapping_pages(mapping,
pos >> PAGE_SHIFT,
endbyte >> PAGE_SHIFT);
} else {
/*
* We don't know how much we wrote, so just return
* the number of bytes which were direct-written
*/
}
} else {
written = generic_perform_write(file, from, iocb->ki_pos);
if (likely(written > 0))
iocb->ki_pos += written;
}
out:
current->backing_dev_info = NULL;
return written ? written : err;
}
EXPORT_SYMBOL(__generic_file_write_iter);
/**
* generic_file_write_iter - write data to a file
* @iocb: IO state structure
* @from: iov_iter with data to write
*
* This is a wrapper around __generic_file_write_iter() to be used by most
* filesystems. It takes care of syncing the file in case of O_SYNC file
* and acquires i_mutex as needed.
* Return:
* * negative error code if no data has been written at all of
* vfs_fsync_range() failed for a synchronous write
* * number of bytes written, even for truncated writes
*/
ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
ssize_t ret;
inode_lock(inode);
ret = generic_write_checks(iocb, from);
if (ret > 0)
ret = __generic_file_write_iter(iocb, from);
inode_unlock(inode);
if (ret > 0)
ret = generic_write_sync(iocb, ret);
return ret;
}
EXPORT_SYMBOL(generic_file_write_iter);
2.3 seek
seek函数的实现使用的是generic_file_llseek。
/**
* generic_file_llseek_size - generic llseek implementation for regular files
* @file: file structure to seek on
* @offset: file offset to seek to
* @whence: type of seek
* @size: max size of this file in file system
* @eof: offset used for SEEK_END position
*
* This is a variant of generic_file_llseek that allows passing in a custom
* maximum file size and a custom EOF position, for e.g. hashed directories
*
* Synchronization:
* SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms)
* SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes.
* read/writes behave like SEEK_SET against seeks.
*/
loff_t
generic_file_llseek_size(struct file *file, loff_t offset, int whence,
loff_t maxsize, loff_t eof)
{
switch (whence) {
case SEEK_END:
offset += eof;
break;
case SEEK_CUR:
/*
* Here we special-case the lseek(fd, 0, SEEK_CUR)
* position-querying operation. Avoid rewriting the "same"
* f_pos value back to the file because a concurrent read(),
* write() or lseek() might have altered it
*/
if (offset == 0)
return file->f_pos;
/*
* f_lock protects against read/modify/write race with other
* SEEK_CURs. Note that parallel writes and reads behave
* like SEEK_SET.
*/
spin_lock(&file->f_lock);
offset = vfs_setpos(file, file->f_pos + offset, maxsize);
spin_unlock(&file->f_lock);
return offset;
case SEEK_DATA:
/*
* In the generic case the entire file is data, so as long as
* offset isn't at the end of the file then the offset is data.
*/
if ((unsigned long long)offset >= eof)
return -ENXIO;
break;
case SEEK_HOLE:
/*
* There is a virtual hole at the end of the file, so as long as
* offset isn't i_size or larger, return i_size.
*/
if ((unsigned long long)offset >= eof)
return -ENXIO;
offset = eof;
break;
}
return vfs_setpos(file, offset, maxsize);
}
EXPORT_SYMBOL(generic_file_llseek_size);
/**
* generic_file_llseek - generic llseek implementation for regular files
* @file: file structure to seek on
* @offset: file offset to seek to
* @whence: type of seek
*
* This is a generic implemenation of ->llseek useable for all normal local
* filesystems. It just updates the file offset to the value specified by
* @offset and @whence.
*/
loff_t generic_file_llseek(struct file *file, loff_t offset, int whence)
{
struct inode *inode = file->f_mapping->host;
return generic_file_llseek_size(file, offset, whence,
inode->i_sb->s_maxbytes,
i_size_read(inode));
}
EXPORT_SYMBOL(generic_file_llseek);
2.4 mmap
mmap的实现使用的是generic_file_mmap。
/* This is used for a general mmap of a disk file */
int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
{
struct address_space *mapping = file->f_mapping;
if (!mapping->a_ops->readpage)
return -ENOEXEC;
file_accessed(file);
vma->vm_ops = &generic_file_vm_ops;
return 0;
}
const struct vm_operations_struct generic_file_vm_ops = {
.fault = filemap_fault,
.map_pages = filemap_map_pages,
.page_mkwrite = filemap_page_mkwrite,
#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
.allow_speculation = filemap_allow_speculation,
#endif
};
generic_file_vm_ops中实现了fault函数,fault会在处理缺页异常时被调用。
/**
* filemap_fault - read in file data for page fault handling
* @vmf: struct vm_fault containing details of the fault
*
* filemap_fault() is invoked via the vma operations vector for a
* mapped memory region to read in file data during a page fault.
*
* The goto's are kind of ugly, but this streamlines the normal case of having
* it in the page cache, and handles the special cases reasonably without
* having a lot of duplicated code.
*
* vma->vm_mm->mmap_lock must be held on entry.
*
* If our return value has VM_FAULT_RETRY set, it's because the mmap_lock
* may be dropped before doing I/O or by lock_page_maybe_drop_mmap().
*
* If our return value does not have VM_FAULT_RETRY set, the mmap_lock
* has not been released.
*
* We never return with VM_FAULT_RETRY and a bit from VM_FAULT_ERROR set.
*
* Return: bitwise-OR of %VM_FAULT_ codes.
*/
vm_fault_t filemap_fault(struct vm_fault *vmf)
{
int error;
struct file *file = vmf->vma->vm_file;
struct file *fpin = NULL;
struct address_space *mapping = file->f_mapping;
struct file_ra_state *ra = &file->f_ra;
struct inode *inode = mapping->host;
pgoff_t offset = vmf->pgoff;
pgoff_t max_off;
struct page *page = NULL;
vm_fault_t ret = 0;
bool retry = false;
max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
if (unlikely(offset >= max_off))
return VM_FAULT_SIGBUS;
trace_android_vh_filemap_fault_get_page(vmf, &page, &retry);
if (unlikely(retry))
goto out_retry;
if (unlikely(page))
goto page_ok;
/*
* Do we have something in the page cache already?
*/
page = find_get_page(mapping, offset);
if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) {
/*
* We found the page, so try async readahead before
* waiting for the lock.
*/
fpin = do_async_mmap_readahead(vmf, page);
} else if (!page) {
/* No page in the page cache at all */
count_vm_event(PGMAJFAULT);
count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
ret = VM_FAULT_MAJOR;
fpin = do_sync_mmap_readahead(vmf);
retry_find:
page = pagecache_get_page(mapping, offset,
FGP_CREAT|FGP_FOR_MMAP,
vmf->gfp_mask);
if (!page) {
if (fpin)
goto out_retry;
return VM_FAULT_OOM;
}
}
if (!lock_page_maybe_drop_mmap(vmf, page, &fpin))
goto out_retry;
/* Did it get truncated? */
if (unlikely(compound_head(page)->mapping != mapping)) {
unlock_page(page);
put_page(page);
goto retry_find;
}
VM_BUG_ON_PAGE(page_to_pgoff(page) != offset, page);
/*
* We have a locked page in the page cache, now we need to check
* that it's up-to-date. If not, it is going to be due to an error.
*/
if (unlikely(!PageUptodate(page)))
goto page_not_uptodate;
/*
* We've made it this far and we had to drop our mmap_lock, now is the
* time to return to the upper layer and have it re-find the vma and
* redo the fault.
*/
if (fpin) {
unlock_page(page);
goto out_retry;
}
page_ok:
/*
* Found the page and have a reference on it.
* We must recheck i_size under page lock.
*/
max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
if (unlikely(offset >= max_off)) {
unlock_page(page);
put_page(page);
return VM_FAULT_SIGBUS;
}
vmf->page = page;
return ret | VM_FAULT_LOCKED;
page_not_uptodate:
/*
* Umm, take care of errors if the page isn't up-to-date.
* Try to re-read it _once_. We do this synchronously,
* because there really aren't any performance issues here
* and we need to check for errors.
*/
ClearPageError(page);
fpin = maybe_unlock_mmap_for_io(vmf, fpin);
error = mapping->a_ops->readpage(file, page);
if (!error) {
wait_on_page_locked(page);
if (!PageUptodate(page))
error = -EIO;
}
if (fpin)
goto out_retry;
put_page(page);
if (!error || error == AOP_TRUNCATED_PAGE)
goto retry_find;
shrink_readahead_size_eio(ra);
return VM_FAULT_SIGBUS;
out_retry:
/*
* We dropped the mmap_lock, we need to return to the fault handler to
* re-find the vma and come back and find our hopefully still populated
* page.
*/
if (page) {
trace_android_vh_filemap_fault_cache_page(vmf, page);
put_page(page);
}
if (fpin)
fput(fpin);
return ret | VM_FAULT_RETRY;
}
EXPORT_SYMBOL(filemap_fault);
3. address_space_operations
file_operations结构中提到了aops结构,此结构在FAT文件系统中为fat_aops。
static const struct address_space_operations fat_aops = {
.readpage = fat_readpage,
.readahead = fat_readahead,
.writepage = fat_writepage,
.writepages = fat_writepages,
.write_begin = fat_write_begin,
.write_end = fat_write_end,
.direct_IO = fat_direct_IO,
.bmap = _fat_bmap
};
readpage:
static int fat_readpage(struct file *file, struct page *page)
{
return mpage_readpage(page, fat_get_block);
}
/*
* This isn't called much at all
*/
int mpage_readpage(struct page *page, get_block_t get_block)
{
struct mpage_readpage_args args = {
.page = page,
.nr_pages = 1,
.get_block = get_block,
};
args.bio = do_mpage_readpage(&args);
if (args.bio)
mpage_bio_submit(REQ_OP_READ, 0, args.bio);
return 0;
}
EXPORT_SYMBOL_NS(mpage_readpage, ANDROID_GKI_VFS_EXPORT_ONLY);