f2fs解析-1-元数据布局

13eholder

2026-03-12 (Updated: 2026-03-12)

磁盘布局

    磁盘结构代码描述参照f2fs_fs.h以及Linux 5.10代码实现

F2FS-Meta
以1GB磁盘镜像格式化为f2fs的参数来具体举例说明
参数配置:

{
"superblock": {
    "magic": "0xf2f52010",
    "major_ver": 1,
    "minor_ver": 9,
    "log_sectorsize": 9,
    "log_sectors_per_block": 3,
    "log_blocksize": 12,
    "log_blocks_per_seg": 9,
    "segs_per_sec": 1,
    "secs_per_zone": 1,
    "block_count": 250000,
    "section_count": 478,
    "segment_count": 487,
    "segment_count_ckpt": 2,
    "segment_count_sit": 2,
    "segment_count_nat": 4,
    "segment_count_ssa": 1,
    "segment_count_main": 478,
    "segment0_blkaddr": 512,
    "cp_blkaddr": 512,
    "sit_blkaddr": 1536,
    "nat_blkaddr": 2560,
    "ssa_blkaddr": 4608,
    "main_blkaddr": 5120,
    "root_ino": 3,
    "node_ino": 1,
    "meta_ino": 2,
    "block_size": 4096,
    "blocks_per_seg": 512,
    "volume_name": "\u7461\ufffd\ub8fc\uf649\u6cb4\u3ec0\u9ea9\u341f"
  }
}

section_count == segment_count_main (因为 segs_per_sec 为 1,每个 section 只包含一个 segment): 表示Main区域即写入DATA/NODE 数据可用段数量
segment_count_{ckpt,sit,nat} 都是实际所需数量的两倍,采用双缓冲机制, 根据当前检查点标志位决定使用哪个缓冲区
blkaddr: 逻辑块号,转换成实际物理地址的时候需要乘 block_size

Superblock

superblock位于磁盘的第一个segment的前两个块,挂载文件系统时两个块都需要读取并进行校验

f2fs-sb

核心代码定义如下:

元数据各段磁盘占用
各分段的起始地址

检查点负载(负载决定检查点的磁盘布局)

/*
 * On-disk superblock (abridged excerpt: the parameter dump earlier in this
 * document also lists fields such as magic and the version numbers, which
 * are omitted here).
 *
 * The fields fall into two groups: how many segments each metadata area
 * occupies (segment_count_*) and where each area starts (*_blkaddr).
 * Per the accompanying notes, a blkaddr is a logical block number and has
 * to be multiplied by the block size to obtain a byte offset.
 */
struct f2fs_super_block {
	__le64 block_count;		/* total # of user blocks */
	__le32 section_count;		/* total # of sections */
	__le32 segment_count;		/* total # of segments */
	__le32 segment_count_ckpt;	/* # of segments for checkpoint */
	__le32 segment_count_sit;	/* # of segments for SIT */
	__le32 segment_count_nat;	/* # of segments for NAT */
	__le32 segment_count_ssa;	/* # of segments for SSA */
	__le32 segment_count_main;	/* # of segments for main area */
	__le32 cp_blkaddr;		/* start block address of checkpoint */
	__le32 sit_blkaddr;		/* start block address of SIT */
	__le32 nat_blkaddr;		/* start block address of NAT */
	__le32 ssa_blkaddr;		/* start block address of SSA */
	__le32 main_blkaddr;		/* start block address of main area */
	__le32 cp_payload;		/* checkpoint payload; per the notes in this document it decides the checkpoint's on-disk layout */
	__le32 crc;			/* checksum of superblock */
} __packed;

CheckPoint

checkpoint布局根据磁盘容量略有不同,阈值大约在1.09TB

payload为0时 sit_bitmap 和 nat_bitmap内联在block 0
payload为1时 nat_bitmap内联在block 0; sit_bitmap单独存放在block 0之后的payload块上
如果存在孤儿节点,将孤儿节点写入Block 1~N
Block N+1 ~ N+6 存放当前使用的段摘要(DATA -> NODE; HOT->WARM->COLD)
HOT DATA和HOT NODE 对应的段摘要有Journal字段,可以存放8条 nat entry或 sit entry,用于避免频繁覆盖写入热点区域的sit block和nat block;

f2fs-cp

Segment Info Table

segment info table使用的块数在mkfs.f2fs创建文件系统时根据磁盘容量计算后写入到superblock就不再变化, sit管理main area段分配和段内块分配
curseg指向的段可以分配block;
满容量的段不会被GC回收;含有无效块的段可以被GC回收

/*
 * One Segment Info Table entry. Per the accompanying notes, the SIT manages
 * segment allocation and in-segment block allocation for the main area.
 */
struct f2fs_sit_entry {
	__le16 vblocks;				/* presumably valid-block info; the original comment refers to a description in f2fs_fs.h that this excerpt does not reproduce -- TODO confirm bit layout there */
	__u8 valid_map[SIT_VBLOCK_MAP_SIZE];	/* bitmap for valid blocks */
	__le64 mtime;				/* segment age for cleaning */
} __attribute__((packed));

/* One SIT block: a fixed-size array of SIT entries. */
struct f2fs_sit_block {
	struct f2fs_sit_entry entries[SIT_ENTRY_PER_BLOCK]; /* 55 entries per block, according to the author's annotation */
} __attribute__((packed));

Node Address Table

node是f2fs为了解决wandering tree引入的间接索引结构体

代码定义如下:

f2fs_node是一个联合体;既可以用来表示Inode,也可以表示内部磁盘块管理单元Node
f2fs_node具有唯一的内部表示nid; 管理f2fs_node的磁盘块就是node address table

f2fs_inode可以拥有多个node:

一个ino对应一个目录或普通文件
一个ino可以对应多个 nid

一个nid只属于一个ino

/*
 * One Node Address Table entry. Per the accompanying notes, every f2fs_node
 * is identified by a nid and the NAT is the on-disk area that manages them;
 * an entry records the owning inode and the node's block address.
 */
struct f2fs_nat_entry {
	__u8 version;		/* latest version of cached nat entry */
	__le32 ino;		/* inode number */
	__le32 block_addr;	/* block address */
} __packed;

/* One NAT block: a fixed-size array of NAT entries. */
struct f2fs_nat_block {
	struct f2fs_nat_entry entries[NAT_ENTRY_PER_BLOCK];
} __packed;

/* Trailer stored at the end of every node block (see struct f2fs_node). */
struct node_footer {
	__le32 nid;		/* node id */
	__le32 ino;		/* inode number */
	__le32 flag;		/* include cold/fsync/dentry marks and offset */
	__le64 cp_ver;		/* checkpoint version */
	__le32 next_blkaddr;	/* next node page block address */
} __attribute__((packed));

/*
 * A node block: a union body followed by a footer. The same block format
 * therefore represents either an inode or one of the two index-node kinds.
 */
struct f2fs_node {
	/* can be one of three types: inode, direct, and indirect types */
	union {
		struct f2fs_inode i;
		struct direct_node dn;
		struct indirect_node in;
	};
	struct node_footer footer;	/* identifies the node (nid/ino) regardless of which body is in use */
} __attribute__((packed));

Inode磁盘布局如下: 每个目录和常规文件创建的时候，都只占一个磁盘块(分配在main区域),被标记为inline_inode

磁盘块头部存放元数据: mode,flag,time,size,name等
磁盘块的尾部存放固定数量的node索引: 2 single indirect, 2 double indirect, 1 triple indirect (代码中管 single indirect叫 direct node; double和triple叫 indirect node)
中间的部分是一个联合体:
- 内联的数据(data或dir entry)
- 内联的拓展属性
- 直接指针(存放nid,指向其他node)

f2fs-node

代码定义如下:

/*
 * Inline flags for an inode. Each value is a distinct single bit, so the
 * flags can be combined in one bitmask.
 */
#define F2FS_INLINE_XATTR   0x01    /* file inline xattr flag */
#define F2FS_INLINE_DATA    0x02    /* file inline data flag */
#define F2FS_INLINE_DENTRY  0x04    /* file inline dentry flag */

/*
 * On-disk inode (abridged excerpt). Per the accompanying notes, the head of
 * the block holds metadata, the tail holds a fixed number of node ids, and
 * the middle is the union shown below.
 */
struct f2fs_inode {
	/* leading metadata fields (mode, flags, times, size, name, ...) are omitted from this excerpt */
	union {
		/* extra attributes; being in the same union, they share storage with the start of i_addr */
		struct {
			__le16 i_extra_isize;	/* extra inode attribute size */
			__le16 i_inline_xattr_size;	/* inline xattr size, unit: 4 bytes */
			__le32 i_projid;	/* project id */
			__le32 i_inode_checksum;/* inode meta checksum */
			__le64 i_crtime;	/* creation time */
			__le32 i_crtime_nsec;	/* creation time in nano scale */
			__le64 i_compr_blocks;	/* # of compressed blocks */
			__u8 i_compress_algorithm;	/* compress algorithm */
			__u8 i_log_cluster_size;	/* log of cluster size */
			__le16 i_padding;		/* padding */
			__le32 i_extra_end[0];	/* for attribute size calculation */
		} __packed;
		__le32 i_addr[DEF_ADDRS_PER_INODE];	/* Pointers to data blocks */
	};
	__le32 i_nid[DEF_NIDS_PER_INODE];	/* direct(2), indirect(2), double_indirect(1) node id */
} __attribute__((packed));

举例说明inode是如何一步一步膨胀的:

f2fs-inode

direct node和indirect node代码示例如下

/* Body of a direct node block: an array of data block addresses. */
struct direct_node {
	__le32 addr[ADDRS_PER_BLOCK];	/* array of data block address */
} __attribute__((packed));

/* Body of an indirect node block: an array of node ids that refer to other nodes. */
struct indirect_node {
	__le32 nid[NIDS_PER_BLOCK];	/* array of node ids (not data block addresses) */
} __attribute__((packed));

Segment Summary Table

反向索引，用于GC

磁盘容量紧张，选定了segment x 回收，x上目前有7个有效块；根据sit table可以迅速定位到blk addr；blk所属的node需要感知到修改; 为了快速索引，引入了SSA

/*
 * a summary entry for a 4KB-sized block in a segment
 *
 * Per the accompanying notes, summaries act as a reverse index for GC:
 * given a block, the entry identifies the node that references it.
 */
struct f2fs_summary {
	 // - If data page, nid represents dnode's nid
	 // - If node page, nid represents the node page's nid.
	__le32 nid;		/* parent node id */
	union {
		__u8 reserved[3];	/* raw 3-byte view of the same storage as version + ofs_in_node */
		struct {
			__u8 version;		/* node version number */
			__le16 ofs_in_node;	/* block index in parent node */
		} __attribute__((packed));
	};
} __attribute__((packed));

写入模式

元数据区域采用双缓冲机制维护数据一致性,采用Checkpoint Journal避免NAT和SIT的频繁更新；数据区域根据剩余磁盘容量决定使用追加写还是覆盖写

双缓冲

根据最新CheckPoint sit_nat_version_bitmap取值决定使用哪一组缓冲区:

当前使用第一组元数据; 新的修改会先缓存在内存中并标记为脏
sync或gc触发检查点,并且需要写回的元数据多(超过8条)
- 写入内存中的元数据到磁盘中的另外一组缓冲区
- 写入检查点并且标识使用新的缓冲区
使用新的检查点和新的缓冲区

即使在进行检查点操作时崩溃，不会影响另外一个检查点对应的数据和元数据

Checkpoint Journal

当前使用的热数据/节点区域进行的修改不超过8条时，为了避免小写入导致频繁地缓冲区翻转，f2fs将这些更改写入到检查点段摘要的日志区域,此时不需要翻转sit_nat_version_bitmap,仍使用原缓冲区

OPU & IPU

当Main Area可用空间较多时，总是分配一个新块进行写入；否则写回到原来的块(触发GC)