X-Git-Url: https://main.carlh.net/gitweb/?a=blobdiff_plain;f=lwext4%2Fext4_journal.c;h=86366a1bf044a5792dea5395908bc37a6c7f104c;hb=ea7ce7dc4a1cb5af01175324c5da8dceb3f8652d;hp=b3c1d0237f01fed374a0eda59941625f86235204;hpb=43da1b2f9a9b6922369cc47e265cac3c93112325;p=lwext4.git diff --git a/lwext4/ext4_journal.c b/lwext4/ext4_journal.c index b3c1d02..86366a1 100644 --- a/lwext4/ext4_journal.c +++ b/lwext4/ext4_journal.c @@ -1,26 +1,349 @@ +/* + * Copyright (c) 2015 Grzegorz Kostka (kostka.grzegorz@gmail.com) + * Copyright (c) 2015 Kaho Ng (ngkaho1234@gmail.com) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * - The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/** @addtogroup lwext4 + * @{ + */ /** * @file ext4_journal.c - * @brief Journalling + * @brief Journal handle functions */ #include "ext4_config.h" #include "ext4_types.h" #include "ext4_fs.h" #include "ext4_super.h" +#include "ext4_journal.h" #include "ext4_errno.h" #include "ext4_blockdev.h" -#include "ext4_crc32c.h" +#include "ext4_crc32.h" #include "ext4_debug.h" -#include "tree.h" #include -#include +#include -int jbd_inode_bmap(struct jbd_fs *jbd_fs, - ext4_lblk_t iblock, - ext4_fsblk_t *fblock); +/**@brief Revoke entry during journal replay.*/ +struct revoke_entry { + /**@brief Block number not to be replayed.*/ + ext4_fsblk_t block; + + /**@brief For any transaction id smaller + * than trans_id, records of @block + * in those transactions should not + * be replayed.*/ + uint32_t trans_id; + + /**@brief Revoke tree node.*/ + RB_ENTRY(revoke_entry) revoke_node; +}; + +/**@brief Valid journal replay information.*/ +struct recover_info { + /**@brief Starting transaction id.*/ + uint32_t start_trans_id; + + /**@brief Ending transaction id.*/ + uint32_t last_trans_id; + + /**@brief Used as internal argument.*/ + uint32_t this_trans_id; + + /**@brief No of transactions went through.*/ + uint32_t trans_cnt; + + /**@brief RB-Tree storing revoke entries.*/ + RB_HEAD(jbd_revoke, revoke_entry) revoke_root; +}; + +/**@brief Journal replay internal arguments.*/ +struct replay_arg { + /**@brief Journal replay information.*/ + struct recover_info *info; + + 
/**@brief Current block we are on.*/ + uint32_t *this_block; + + /**@brief Current trans_id we are on.*/ + uint32_t this_trans_id; +}; + +static int +jbd_revoke_entry_cmp(struct revoke_entry *a, struct revoke_entry *b) +{ + if (a->block > b->block) + return 1; + else if (a->block < b->block) + return -1; + return 0; +} + +static int +jbd_block_rec_cmp(struct jbd_block_rec *a, struct jbd_block_rec *b) +{ + if (a->lba > b->lba) + return 1; + else if (a->lba < b->lba) + return -1; + return 0; +} + +RB_GENERATE_INTERNAL(jbd_revoke, revoke_entry, revoke_node, + jbd_revoke_entry_cmp, static inline) +RB_GENERATE_INTERNAL(jbd_block, jbd_block_rec, block_rec_node, + jbd_block_rec_cmp, static inline) + +#define jbd_alloc_revoke_entry() calloc(1, sizeof(struct revoke_entry)) +#define jbd_free_revoke_entry(addr) free(addr) + +static int jbd_has_csum(struct jbd_sb *jbd_sb) +{ + if (JBD_HAS_INCOMPAT_FEATURE(jbd_sb, JBD_FEATURE_INCOMPAT_CSUM_V2)) + return 2; + + if (JBD_HAS_INCOMPAT_FEATURE(jbd_sb, JBD_FEATURE_INCOMPAT_CSUM_V3)) + return 3; + + return 0; +} + +#if CONFIG_META_CSUM_ENABLE +static uint32_t jbd_sb_csum(struct jbd_sb *jbd_sb) +{ + uint32_t checksum = 0; + + if (jbd_has_csum(jbd_sb)) { + uint32_t orig_checksum = jbd_sb->checksum; + jbd_set32(jbd_sb, checksum, 0); + /* Calculate crc32c checksum against the whole superblock */ + checksum = ext4_crc32c(EXT4_CRC32_INIT, jbd_sb, + JBD_SUPERBLOCK_SIZE); + jbd_sb->checksum = orig_checksum; + } + return checksum; +} +#else +#define jbd_sb_csum(...) 0 +#endif + +static void jbd_sb_csum_set(struct jbd_sb *jbd_sb) +{ + if (!jbd_has_csum(jbd_sb)) + return; + + jbd_set32(jbd_sb, checksum, jbd_sb_csum(jbd_sb)); +} + +#if CONFIG_META_CSUM_ENABLE +static bool +jbd_verify_sb_csum(struct jbd_sb *jbd_sb) +{ + if (!jbd_has_csum(jbd_sb)) + return true; + + return jbd_sb_csum(jbd_sb) == jbd_get32(jbd_sb, checksum); +} +#else +#define jbd_verify_sb_csum(...) 
true +#endif -int jbd_sb_write(struct jbd_fs *jbd_fs, struct jbd_sb *s) +#if CONFIG_META_CSUM_ENABLE +static uint32_t jbd_meta_csum(struct jbd_fs *jbd_fs, + struct jbd_bhdr *bhdr) +{ + uint32_t checksum = 0; + + if (jbd_has_csum(&jbd_fs->sb)) { + uint32_t block_size = jbd_get32(&jbd_fs->sb, blocksize); + struct jbd_block_tail *tail = + (struct jbd_block_tail *)((char *)bhdr + block_size - + sizeof(struct jbd_block_tail)); + uint32_t orig_checksum = tail->checksum; + tail->checksum = 0; + + /* First calculate crc32c checksum against fs uuid */ + checksum = ext4_crc32c(EXT4_CRC32_INIT, jbd_fs->sb.uuid, + sizeof(jbd_fs->sb.uuid)); + /* Calculate crc32c checksum against the whole block */ + checksum = ext4_crc32c(checksum, bhdr, + block_size); + tail->checksum = orig_checksum; + } + return checksum; +} +#else +#define jbd_meta_csum(...) 0 +#endif + +static void jbd_meta_csum_set(struct jbd_fs *jbd_fs, + struct jbd_bhdr *bhdr) +{ + uint32_t block_size = jbd_get32(&jbd_fs->sb, blocksize); + struct jbd_block_tail *tail = (struct jbd_block_tail *) + ((char *)bhdr + block_size - + sizeof(struct jbd_block_tail)); + if (!jbd_has_csum(&jbd_fs->sb)) + return; + + tail->checksum = to_be32(jbd_meta_csum(jbd_fs, bhdr)); +} + +#if CONFIG_META_CSUM_ENABLE +static bool +jbd_verify_meta_csum(struct jbd_fs *jbd_fs, + struct jbd_bhdr *bhdr) +{ + uint32_t block_size = jbd_get32(&jbd_fs->sb, blocksize); + struct jbd_block_tail *tail = (struct jbd_block_tail *) + ((char *)bhdr + block_size - + sizeof(struct jbd_block_tail)); + if (!jbd_has_csum(&jbd_fs->sb)) + return true; + + return jbd_meta_csum(jbd_fs, bhdr) == to_be32(tail->checksum); +} +#else +#define jbd_verify_meta_csum(...) 
true +#endif + +#if CONFIG_META_CSUM_ENABLE +static uint32_t jbd_commit_csum(struct jbd_fs *jbd_fs, + struct jbd_commit_header *header) +{ + uint32_t checksum = 0; + + if (jbd_has_csum(&jbd_fs->sb)) { + uint32_t orig_checksum_type = header->chksum_type, + orig_checksum_size = header->chksum_size, + orig_checksum = header->chksum[0]; + uint32_t block_size = jbd_get32(&jbd_fs->sb, blocksize); + header->chksum_type = 0; + header->chksum_size = 0; + header->chksum[0] = 0; + + /* First calculate crc32c checksum against fs uuid */ + checksum = ext4_crc32c(EXT4_CRC32_INIT, jbd_fs->sb.uuid, + sizeof(jbd_fs->sb.uuid)); + /* Calculate crc32c checksum against the whole block */ + checksum = ext4_crc32c(checksum, header, + block_size); + + header->chksum_type = orig_checksum_type; + header->chksum_size = orig_checksum_size; + header->chksum[0] = orig_checksum; + } + return checksum; +} +#else +#define jbd_commit_csum(...) 0 +#endif + +/**@brief Store the commit block checksum in the commit header. + * Does nothing when the journal has no checksum feature enabled. + * @param jbd_fs jbd filesystem + * @param header commit block header to update*/ +static void jbd_commit_csum_set(struct jbd_fs *jbd_fs, + struct jbd_commit_header *header) +{ + if (!jbd_has_csum(&jbd_fs->sb)) + return; + + header->chksum_type = 0; + header->chksum_size = 0; + /* Convert with to_be32() so that the stored value matches what + * jbd_verify_commit_csum() compares against.*/ + header->chksum[0] = to_be32(jbd_commit_csum(jbd_fs, header)); +} + +#if CONFIG_META_CSUM_ENABLE +static bool jbd_verify_commit_csum(struct jbd_fs *jbd_fs, + struct jbd_commit_header *header) +{ + if (!jbd_has_csum(&jbd_fs->sb)) + return true; + + return header->chksum[0] == to_be32(jbd_commit_csum(jbd_fs, + header)); +} +#else +#define jbd_verify_commit_csum(...) true +#endif + +#if CONFIG_META_CSUM_ENABLE +/* + * NOTE: We only make use of @csum parameter when + * JBD_FEATURE_COMPAT_CHECKSUM is enabled. 
+ */ +static uint32_t jbd_block_csum(struct jbd_fs *jbd_fs, const void *buf, + uint32_t csum, + uint32_t sequence) +{ + uint32_t checksum = 0; + + if (jbd_has_csum(&jbd_fs->sb)) { + uint32_t block_size = jbd_get32(&jbd_fs->sb, blocksize); + /* First calculate crc32c checksum against fs uuid */ + checksum = ext4_crc32c(EXT4_CRC32_INIT, jbd_fs->sb.uuid, + sizeof(jbd_fs->sb.uuid)); + /* Then calculate crc32c checksum against sequence no. */ + checksum = ext4_crc32c(checksum, &sequence, + sizeof(uint32_t)); + /* Calculate crc32c checksum against the whole block */ + checksum = ext4_crc32c(checksum, buf, + block_size); + } else if (JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb, + JBD_FEATURE_COMPAT_CHECKSUM)) { + /* NOTE(review): a COMPAT flag is tested with the INCOMPAT + * macro here - confirm which feature word is intended.*/ + uint32_t block_size = jbd_get32(&jbd_fs->sb, blocksize); + /* Continue the running crc32 (seeded with @csum) over the whole block */ + checksum = ext4_crc32(csum, buf, + block_size); + } + return checksum; +} +#else +#define jbd_block_csum(...) 0 +#endif + +/**@brief Store a data block checksum into its block tag. + * Does nothing when the journal has no checksum feature enabled. + * @param jbd_fs jbd filesystem + * @param __tag pointer to the block tag + * @param checksum checksum computed for the tagged block*/ +static void jbd_block_tag_csum_set(struct jbd_fs *jbd_fs, void *__tag, + uint32_t checksum) +{ + int ver = jbd_has_csum(&jbd_fs->sb); + if (!ver) + return; + + if (ver == 2) { + struct jbd_block_tag *tag = __tag; + /* A CSUM_V2 tag only has room for 16 bits: keep the low + * half of the checksum and store it with jbd_set16(), + * like the other 16-bit tag fields. Truncating after a + * 32-bit byte swap would keep the wrong half on + * little-endian hosts.*/ + jbd_set16(tag, checksum, (uint16_t)checksum); + } else { + struct jbd_block_tag3 *tag = __tag; + tag->checksum = to_be32(checksum); + } +} + +/**@brief Write jbd superblock to disk. + * @param jbd_fs jbd filesystem + * @param s jbd superblock + * @return standard error code*/ +static int jbd_sb_write(struct jbd_fs *jbd_fs, struct jbd_sb *s) { int rc; struct ext4_fs *fs = jbd_fs->inode_ref.fs; @@ -30,12 +353,17 @@ int jbd_sb_write(struct jbd_fs *jbd_fs, struct jbd_sb *s) if (rc != EOK) return rc; + jbd_sb_csum_set(s); offset = fblock * ext4_sb_get_block_size(&fs->sb); return ext4_block_writebytes(fs->bdev, offset, s, EXT4_SUPERBLOCK_SIZE); } -int jbd_sb_read(struct jbd_fs *jbd_fs, struct jbd_sb *s) +/**@brief Read jbd superblock from disk. 
+ * @param jbd_fs jbd filesystem + * @param s jbd superblock + * @return standard error code*/ +static int jbd_sb_read(struct jbd_fs *jbd_fs, struct jbd_sb *s) { int rc; struct ext4_fs *fs = jbd_fs->inode_ref.fs; @@ -50,6 +378,9 @@ int jbd_sb_read(struct jbd_fs *jbd_fs, struct jbd_sb *s) EXT4_SUPERBLOCK_SIZE); } +/**@brief Verify jbd superblock. + * @param sb jbd superblock + * @return true if jbd superblock is valid */ static bool jbd_verify_sb(struct jbd_sb *sb) { struct jbd_bhdr *header = &sb->header; @@ -60,9 +391,29 @@ static bool jbd_verify_sb(struct jbd_sb *sb) jbd_get32(header, blocktype) != JBD_SUPERBLOCK_V2) return false; - return true; + return jbd_verify_sb_csum(sb); +} + +/**@brief Write back dirty jbd superblock to disk. + * @param jbd_fs jbd filesystem + * @return standard error code*/ +static int jbd_write_sb(struct jbd_fs *jbd_fs) +{ + int rc = EOK; + if (jbd_fs->dirty) { + rc = jbd_sb_write(jbd_fs, &jbd_fs->sb); + if (rc != EOK) + return rc; + + jbd_fs->dirty = false; + } + return rc; } +/**@brief Get reference to jbd filesystem. + * On failure the inode reference is released and jbd_fs is zeroed. + * @param fs Filesystem to load journal of + * @param jbd_fs jbd filesystem + * @return standard error code (EIO if the jbd superblock is invalid)*/ int jbd_get_fs(struct ext4_fs *fs, struct jbd_fs *jbd_fs) { @@ -70,6 +421,9 @@ int jbd_get_fs(struct ext4_fs *fs, uint32_t journal_ino; memset(jbd_fs, 0, sizeof(struct jbd_fs)); + /* See if there is journal inode on this filesystem.*/ + /* FIXME: detection of the existence of a journal bdev is + * missing.*/ journal_ino = ext4_get32(&fs->sb, journal_inode_number); rc = ext4_fs_get_inode_ref(fs, @@ -83,23 +437,39 @@ int jbd_get_fs(struct ext4_fs *fs, if (rc != EOK) { - memset(jbd_fs, 0, sizeof(struct jbd_fs)); + /* Release the inode reference before wiping jbd_fs: + * clearing first would hand a zeroed reference to + * ext4_fs_put_inode_ref() and leak the real one.*/ ext4_fs_put_inode_ref(&jbd_fs->inode_ref); + memset(jbd_fs, 0, sizeof(struct jbd_fs)); + return rc; + } + if (!jbd_verify_sb(&jbd_fs->sb)) { + ext4_fs_put_inode_ref(&jbd_fs->inode_ref); + memset(jbd_fs, 0, sizeof(struct jbd_fs)); + rc = EIO; } return rc; } +/**@brief Put reference of jbd filesystem. 
+ * @param jbd_fs jbd filesystem + * @return standard error code*/ int jbd_put_fs(struct jbd_fs *jbd_fs) { - int rc; - rc = ext4_fs_put_inode_ref(&jbd_fs->inode_ref); + int rc = EOK; + rc = jbd_write_sb(jbd_fs); + + ext4_fs_put_inode_ref(&jbd_fs->inode_ref); return rc; } +/**@brief Data block lookup helper. + * @param jbd_fs jbd filesystem + * @param iblock block index + * @param fblock logical block address + * @return standard error code*/ int jbd_inode_bmap(struct jbd_fs *jbd_fs, ext4_lblk_t iblock, ext4_fsblk_t *fblock) { - int rc = ext4_fs_get_inode_data_block_index( + int rc = ext4_fs_get_inode_dblk_idx( &jbd_fs->inode_ref, iblock, fblock, @@ -107,13 +477,21 @@ int jbd_inode_bmap(struct jbd_fs *jbd_fs, return rc; } -int jbd_block_get(struct jbd_fs *jbd_fs, +/**@brief jbd block get function (through cache). + * @param jbd_fs jbd filesystem + * @param block block descriptor + * @param fblock jbd logical block address + * @return standard error code*/ +static int jbd_block_get(struct jbd_fs *jbd_fs, struct ext4_block *block, ext4_fsblk_t fblock) { /* TODO: journal device. */ int rc; ext4_lblk_t iblock = (ext4_lblk_t)fblock; + + /* Lookup the logical block address of + * fblock.*/ rc = jbd_inode_bmap(jbd_fs, iblock, &fblock); if (rc != EOK) @@ -121,10 +499,25 @@ int jbd_block_get(struct jbd_fs *jbd_fs, struct ext4_blockdev *bdev = jbd_fs->inode_ref.fs->bdev; rc = ext4_block_get(bdev, block, fblock); + + /* If succeeded, mark buffer as BC_FLUSH to indicate + * that data should be written to disk immediately.*/ + if (rc == EOK) { + ext4_bcache_set_flag(block->buf, BC_FLUSH); + /* As we don't want to occupy too much space + * in block cache, we set this buffer BC_TMP.*/ + ext4_bcache_set_flag(block->buf, BC_TMP); + } + return rc; } -int jbd_block_get_noread(struct jbd_fs *jbd_fs, +/**@brief jbd block get function (through cache, don't read). 
+ * @param jbd_fs jbd filesystem + * @param block block descriptor + * @param fblock jbd logical block address + * @return standard error code*/ +static int jbd_block_get_noread(struct jbd_fs *jbd_fs, struct ext4_block *block, ext4_fsblk_t fblock) { @@ -138,29 +531,41 @@ int jbd_block_get_noread(struct jbd_fs *jbd_fs, struct ext4_blockdev *bdev = jbd_fs->inode_ref.fs->bdev; rc = ext4_block_get_noread(bdev, block, fblock); + if (rc == EOK) + ext4_bcache_set_flag(block->buf, BC_FLUSH); + return rc; } -int jbd_block_set(struct jbd_fs *jbd_fs, +/**@brief jbd block set procedure (through cache). + * @param jbd_fs jbd filesystem + * @param block block descriptor + * @return standard error code*/ +static int jbd_block_set(struct jbd_fs *jbd_fs, struct ext4_block *block) { return ext4_block_set(jbd_fs->inode_ref.fs->bdev, block); } -/* - * helper functions to deal with 32 or 64bit block numbers. - */ -int jbd_tag_bytes(struct jbd_fs *jbd_fs) +/**@brief helper functions to calculate + * block tag size, not including UUID part. + * @param jbd_fs jbd filesystem + * @return tag size in bytes*/ +static int jbd_tag_bytes(struct jbd_fs *jbd_fs) { int size; + /* It is very easy to deal with the case which + * JBD_FEATURE_INCOMPAT_CSUM_V3 is enabled.*/ if (JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb, JBD_FEATURE_INCOMPAT_CSUM_V3)) return sizeof(struct jbd_block_tag3); size = sizeof(struct jbd_block_tag); + /* If JBD_FEATURE_INCOMPAT_CSUM_V2 is enabled, + * add 2 bytes to size.*/ if (JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb, JBD_FEATURE_INCOMPAT_CSUM_V2)) size += sizeof(uint16_t); @@ -169,106 +574,234 @@ int jbd_tag_bytes(struct jbd_fs *jbd_fs) JBD_FEATURE_INCOMPAT_64BIT)) return size; + /* If block number is 4 bytes in size, + * minus 4 bytes from size */ return size - sizeof(uint32_t); } -static void +/**@brief Tag information. 
*/ +struct tag_info { + /**@brief Tag size in bytes, including UUID part.*/ + int tag_bytes; + + /**@brief block number stored in this tag.*/ + ext4_fsblk_t block; + + /**@brief whether UUID part exists or not.*/ + bool uuid_exist; + + /**@brief UUID content if UUID part exists.*/ + uint8_t uuid[UUID_SIZE]; + + /**@brief Is this the last tag? */ + bool last_tag; + + /**@brief crc32c checksum. */ + uint32_t checksum; +}; + +/**@brief Extract information from a block tag. + * @param __tag pointer to the block tag + * @param tag_bytes block tag size of this jbd filesystem + * @param remaining size in buffer containing the block tag + * @param tag_info information of this tag. + * @return EOK when succeed, otherwise return EINVAL.*/ +static int jbd_extract_block_tag(struct jbd_fs *jbd_fs, - uint32_t tag_bytes, void *__tag, - ext4_fsblk_t *block, - bool *uuid_exist, - uint8_t *uuid, - bool *last_tag) + int tag_bytes, + int32_t remain_buf_size, + struct tag_info *tag_info) { char *uuid_start; - *uuid_exist = false; - *last_tag = false; + tag_info->tag_bytes = tag_bytes; + tag_info->uuid_exist = false; + tag_info->last_tag = false; + + /* See whether it is possible to hold a valid block tag.*/ + if (remain_buf_size - tag_bytes < 0) + return EINVAL; + if (JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb, JBD_FEATURE_INCOMPAT_CSUM_V3)) { struct jbd_block_tag3 *tag = __tag; - *block = jbd_get32(tag, blocknr); + tag_info->block = jbd_get32(tag, blocknr); if (JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb, JBD_FEATURE_INCOMPAT_64BIT)) - *block |= (uint64_t)jbd_get32(tag, blocknr_high) << 32; + tag_info->block |= + (uint64_t)jbd_get32(tag, blocknr_high) << 32; if (jbd_get32(tag, flags) & JBD_FLAG_ESCAPE) - *block = 0; + tag_info->block = 0; if (!(jbd_get32(tag, flags) & JBD_FLAG_SAME_UUID)) { + /* See whether it is possible to hold UUID part.*/ + if (remain_buf_size - tag_bytes < UUID_SIZE) + return EINVAL; + uuid_start = (char *)tag + tag_bytes; - *uuid_exist = true; - memcpy(uuid, uuid_start, 
UUID_SIZE); + tag_info->uuid_exist = true; + tag_info->tag_bytes += UUID_SIZE; + memcpy(tag_info->uuid, uuid_start, UUID_SIZE); } if (jbd_get32(tag, flags) & JBD_FLAG_LAST_TAG) - *last_tag = true; + tag_info->last_tag = true; } else { struct jbd_block_tag *tag = __tag; - *block = jbd_get32(tag, blocknr); + tag_info->block = jbd_get32(tag, blocknr); if (JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb, JBD_FEATURE_INCOMPAT_64BIT)) - *block |= (uint64_t)jbd_get32(tag, blocknr_high) << 32; + tag_info->block |= + (uint64_t)jbd_get32(tag, blocknr_high) << 32; if (jbd_get16(tag, flags) & JBD_FLAG_ESCAPE) - *block = 0; + tag_info->block = 0; if (!(jbd_get16(tag, flags) & JBD_FLAG_SAME_UUID)) { + /* See whether it is possible to hold UUID part.*/ + if (remain_buf_size - tag_bytes < UUID_SIZE) + return EINVAL; + uuid_start = (char *)tag + tag_bytes; - *uuid_exist = true; - memcpy(uuid, uuid_start, UUID_SIZE); + tag_info->uuid_exist = true; + tag_info->tag_bytes += UUID_SIZE; + memcpy(tag_info->uuid, uuid_start, UUID_SIZE); } if (jbd_get16(tag, flags) & JBD_FLAG_LAST_TAG) - *last_tag = true; + tag_info->last_tag = true; + + } + return EOK; +} + +/**@brief Write information to a block tag. + * @param __tag pointer to the block tag + * @param remaining size in buffer containing the block tag + * @param tag_info information of this tag. 
+ * @return EOK when succeed, otherwise return EINVAL.*/ +static int +jbd_write_block_tag(struct jbd_fs *jbd_fs, + void *__tag, + int32_t remain_buf_size, + struct tag_info *tag_info) +{ + char *uuid_start; + int tag_bytes = jbd_tag_bytes(jbd_fs); + + tag_info->tag_bytes = tag_bytes; + + /* See whether it is possible to hold a valid block tag.*/ + if (remain_buf_size - tag_bytes < 0) + return EINVAL; + + if (JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb, + JBD_FEATURE_INCOMPAT_CSUM_V3)) { + struct jbd_block_tag3 *tag = __tag; + memset(tag, 0, sizeof(struct jbd_block_tag3)); + jbd_set32(tag, blocknr, tag_info->block); + if (JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb, + JBD_FEATURE_INCOMPAT_64BIT)) + jbd_set32(tag, blocknr_high, tag_info->block >> 32); + + if (tag_info->uuid_exist) { + /* See whether it is possible to hold UUID part.*/ + if (remain_buf_size - tag_bytes < UUID_SIZE) + return EINVAL; + + uuid_start = (char *)tag + tag_bytes; + tag_info->tag_bytes += UUID_SIZE; + memcpy(uuid_start, tag_info->uuid, UUID_SIZE); + } else + jbd_set32(tag, flags, + jbd_get32(tag, flags) | JBD_FLAG_SAME_UUID); + + jbd_block_tag_csum_set(jbd_fs, __tag, tag_info->checksum); + + if (tag_info->last_tag) + jbd_set32(tag, flags, + jbd_get32(tag, flags) | JBD_FLAG_LAST_TAG); + + } else { + struct jbd_block_tag *tag = __tag; + memset(tag, 0, sizeof(struct jbd_block_tag)); + jbd_set32(tag, blocknr, tag_info->block); + if (JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb, + JBD_FEATURE_INCOMPAT_64BIT)) + jbd_set32(tag, blocknr_high, tag_info->block >> 32); + + if (tag_info->uuid_exist) { + /* See whether it is possible to hold UUID part.*/ + if (remain_buf_size - tag_bytes < UUID_SIZE) + return EINVAL; + + uuid_start = (char *)tag + tag_bytes; + tag_info->tag_bytes += UUID_SIZE; + memcpy(uuid_start, tag_info->uuid, UUID_SIZE); + } else + jbd_set16(tag, flags, + jbd_get16(tag, flags) | JBD_FLAG_SAME_UUID); + + jbd_block_tag_csum_set(jbd_fs, __tag, tag_info->checksum); + + if (tag_info->last_tag) + jbd_set16(tag, 
flags, + jbd_get16(tag, flags) | JBD_FLAG_LAST_TAG); } + return EOK; } +/**@brief Iterate all block tags in a block. + * @param jbd_fs jbd filesystem + * @param __tag_start pointer to the block + * @param tag_tbl_size size of the block + * @param func callback routine to indicate that + * a block tag is found + * @param arg additional argument to be passed to func */ static void jbd_iterate_block_table(struct jbd_fs *jbd_fs, void *__tag_start, - uint32_t tag_tbl_size, + int32_t tag_tbl_size, void (*func)(struct jbd_fs * jbd_fs, ext4_fsblk_t block, uint8_t *uuid, void *arg), void *arg) { - ext4_fsblk_t block = 0; - uint8_t uuid[UUID_SIZE]; char *tag_start, *tag_ptr; - uint32_t tag_bytes = jbd_tag_bytes(jbd_fs); + int tag_bytes = jbd_tag_bytes(jbd_fs); tag_start = __tag_start; tag_ptr = tag_start; + /* Cut off the size of block tail storing checksum. */ if (JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb, JBD_FEATURE_INCOMPAT_CSUM_V2) || JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb, JBD_FEATURE_INCOMPAT_CSUM_V3)) tag_tbl_size -= sizeof(struct jbd_block_tail); - while (tag_ptr - tag_start + tag_bytes <= tag_tbl_size) { - bool uuid_exist; - bool last_tag; - jbd_extract_block_tag(jbd_fs, - tag_bytes, + while (tag_tbl_size) { + struct tag_info tag_info; + int rc = jbd_extract_block_tag(jbd_fs, tag_ptr, - &block, - &uuid_exist, - uuid, - &last_tag); + tag_bytes, + tag_tbl_size, + &tag_info); + if (rc != EOK) + break; + if (func) - func(jbd_fs, block, uuid, arg); + func(jbd_fs, tag_info.block, tag_info.uuid, arg); - if (last_tag) + /* Stop the iteration when we reach the last tag. 
*/ + if (tag_info.last_tag) break; - tag_ptr += tag_bytes; - if (uuid_exist) - tag_ptr += UUID_SIZE; - + tag_ptr += tag_info.tag_bytes; + tag_tbl_size -= tag_info.tag_bytes; } } @@ -285,11 +818,124 @@ static void jbd_display_block_tags(struct jbd_fs *jbd_fs, return; } -struct revoke_entry { - ext4_fsblk_t block; - uint32_t trans_id; - RB_ENTRY(revoke_entry) revoke_node; -}; +static struct revoke_entry * +jbd_revoke_entry_lookup(struct recover_info *info, ext4_fsblk_t block) +{ + struct revoke_entry tmp = { + .block = block + }; + + return RB_FIND(jbd_revoke, &info->revoke_root, &tmp); +} + +/**@brief Replay a block in a transaction. + * @param jbd_fs jbd filesystem + * @param block block address to be replayed.*/ +static void jbd_replay_block_tags(struct jbd_fs *jbd_fs, + ext4_fsblk_t block, + uint8_t *uuid __unused, + void *__arg) +{ + int r; + struct replay_arg *arg = __arg; + struct recover_info *info = arg->info; + uint32_t *this_block = arg->this_block; + struct revoke_entry *revoke_entry; + struct ext4_block journal_block, ext4_block; + struct ext4_fs *fs = jbd_fs->inode_ref.fs; + + (*this_block)++; + + /* We replay this block only if the current transaction id + * is equal or greater than that in revoke entry.*/ + revoke_entry = jbd_revoke_entry_lookup(info, block); + if (revoke_entry && + arg->this_trans_id < revoke_entry->trans_id) + return; + + ext4_dbg(DEBUG_JBD, + "Replaying block in block_tag: %" PRIu64 "\n", + block); + + r = jbd_block_get(jbd_fs, &journal_block, *this_block); + if (r != EOK) + return; + + /* We need special treatment for ext4 superblock. 
*/ + if (block) { + r = ext4_block_get_noread(fs->bdev, &ext4_block, block); + if (r != EOK) { + jbd_block_set(jbd_fs, &journal_block); + return; + } + + memcpy(ext4_block.data, + journal_block.data, + jbd_get32(&jbd_fs->sb, blocksize)); + + ext4_bcache_set_dirty(ext4_block.buf); + ext4_block_set(fs->bdev, &ext4_block); + } else { + uint16_t mount_count, state; + mount_count = ext4_get16(&fs->sb, mount_count); + state = ext4_get16(&fs->sb, state); + + memcpy(&fs->sb, + journal_block.data + EXT4_SUPERBLOCK_OFFSET, + EXT4_SUPERBLOCK_SIZE); + + /* Keep the in-memory state field as it was before the copy */ + ext4_set16(&fs->sb, state, state); + r = ext4_sb_write(fs->bdev, &fs->sb); + if (r != EOK) { + /* Do not leak the journal block reference + * taken by jbd_block_get() on this error path.*/ + jbd_block_set(jbd_fs, &journal_block); + return; + } + + /* Restore the mount count saved before the copy */ + ext4_set16(&fs->sb, mount_count, mount_count); + } + + jbd_block_set(jbd_fs, &journal_block); + + return; +} + +/**@brief Add block address to revoke tree, along with + * its transaction id. + * @param info journal replay info + * @param block block address to be replayed.*/ +static void jbd_add_revoke_block_tags(struct recover_info *info, + ext4_fsblk_t block) +{ + struct revoke_entry *revoke_entry; + + ext4_dbg(DEBUG_JBD, "Add block %" PRIu64 " to revoke tree\n", block); + /* If the revoke entry with respect to the block address + * exists already, update its transaction id.*/ + revoke_entry = jbd_revoke_entry_lookup(info, block); + if (revoke_entry) { + revoke_entry->trans_id = info->this_trans_id; + return; + } + + revoke_entry = jbd_alloc_revoke_entry(); + ext4_assert(revoke_entry); + revoke_entry->block = block; + revoke_entry->trans_id = info->this_trans_id; + RB_INSERT(jbd_revoke, &info->revoke_root, revoke_entry); + + return; +} + +static void jbd_destroy_revoke_tree(struct recover_info *info) +{ + while (!RB_EMPTY(&info->revoke_root)) { + struct revoke_entry *revoke_entry = + RB_MIN(jbd_revoke, &info->revoke_root); + ext4_assert(revoke_entry); + RB_REMOVE(jbd_revoke, &info->revoke_root, revoke_entry); + jbd_free_revoke_entry(revoke_entry); + } +} /* Make sure 
we wrap around the log correctly! */ #define wrap(sb, var) \ @@ -302,27 +948,42 @@ do { \ #define ACTION_REVOKE 1 #define ACTION_RECOVER 2 -struct recover_info { - uint32_t start_trans_id; - uint32_t last_trans_id; - RB_HEAD(jbd_revoke, revoke_entry) revoke_root; -}; - -static void jbd_build_revoke_root(struct jbd_fs *jbd_fs, +/**@brief Add entries in a revoke block to revoke tree. + * The byte count stored in the revoke header is bounded to the + * block before it is used; a block whose count does not even + * cover the header is ignored. + * @param jbd_fs jbd filesystem + * @param header revoke block header + * @param info journal replay info*/ +static void jbd_build_revoke_tree(struct jbd_fs *jbd_fs, struct jbd_bhdr *header, struct recover_info *info) { + char *blocks_entry; struct jbd_revoke_header *revoke_hdr = (struct jbd_revoke_header *)header; + uint32_t i, nr_entries, record_len = 4; + uint32_t used_bytes, max_bytes; - jbd_iterate_block_table(jbd_fs, - revoke_hdr + 1, - jbd_get32(&jbd_fs->sb, blocksize) - - sizeof(struct jbd_revoke_header), - jbd_display_block_tags, - NULL); - - (void)info; + /* On a 64bit jbd filesystem each record is 8 bytes wide. */ + if (JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb, + JBD_FEATURE_INCOMPAT_64BIT)) + record_len = 8; + + /* The count field comes from disk, so bound it before use: + * it must not run past the block (or into the checksum + * tail, if present) and must cover at least the header, + * otherwise the subtraction below would wrap around.*/ + max_bytes = jbd_get32(&jbd_fs->sb, blocksize); + if (jbd_has_csum(&jbd_fs->sb)) + max_bytes -= sizeof(struct jbd_block_tail); + + used_bytes = jbd_get32(revoke_hdr, count); + if (used_bytes > max_bytes) + used_bytes = max_bytes; + if (used_bytes < sizeof(struct jbd_revoke_header)) + return; + + nr_entries = (used_bytes - + sizeof(struct jbd_revoke_header)) / + record_len; + + blocks_entry = (char *)(revoke_hdr + 1); + + for (i = 0;i < nr_entries;i++) { + if (record_len == 8) { + uint64_t *blocks = + (uint64_t *)blocks_entry; + jbd_add_revoke_block_tags(info, to_be64(*blocks)); + } else { + uint32_t *blocks = + (uint32_t *)blocks_entry; + jbd_add_revoke_block_tags(info, to_be32(*blocks)); + } + blocks_entry += record_len; + } } static void jbd_debug_descriptor_block(struct jbd_fs *jbd_fs, @@ -337,9 +998,26 @@ static void jbd_debug_descriptor_block(struct jbd_fs *jbd_fs, iblock); } -int jbd_iterate_log(struct jbd_fs *jbd_fs, - struct recover_info *info, - int action) +static void jbd_replay_descriptor_block(struct jbd_fs *jbd_fs, + struct jbd_bhdr *header, + struct replay_arg *arg) +{ + jbd_iterate_block_table(jbd_fs, + header + 1, + jbd_get32(&jbd_fs->sb, blocksize) - + 
sizeof(struct jbd_bhdr), + jbd_replay_block_tags, + arg); +} + +/**@brief The core routine of journal replay. + * @param jbd_fs jbd filesystem + * @param recover_info journal replay info + * @param action action needed to be taken + * @return standard error code*/ +static int jbd_iterate_log(struct jbd_fs *jbd_fs, + struct recover_info *info, + int action) { int r = EOK; bool log_end = false; @@ -347,8 +1025,13 @@ int jbd_iterate_log(struct jbd_fs *jbd_fs, uint32_t start_trans_id, this_trans_id; uint32_t start_block, this_block; + /* We start iterating valid blocks in the whole journal.*/ start_trans_id = this_trans_id = jbd_get32(sb, sequence); start_block = this_block = jbd_get32(sb, start); + if (action == ACTION_SCAN) + info->trans_cnt = 0; + else if (!info->trans_cnt) + log_end = true; ext4_dbg(DEBUG_JBD, "Start of journal at trans id: %" PRIu32 "\n", start_trans_id); @@ -356,6 +1039,10 @@ int jbd_iterate_log(struct jbd_fs *jbd_fs, while (!log_end) { struct ext4_block block; struct jbd_bhdr *header; + /* If we are not scanning for the last + * valid transaction in the journal, + * we will stop when we reach the end of + * the journal.*/ if (action != ACTION_SCAN) if (this_trans_id > info->last_trans_id) { log_end = true; @@ -367,12 +1054,19 @@ int jbd_iterate_log(struct jbd_fs *jbd_fs, break; header = (struct jbd_bhdr *)block.data; + /* This block does not have a valid magic number, + * so we have reached the end of the journal.*/ if (jbd_get32(header, magic) != JBD_MAGIC_NUMBER) { jbd_block_set(jbd_fs, &block); log_end = true; continue; } + /* If the transaction id we found is not expected, + * we may have reached the end of the journal. + * + * If we are not scanning the journal, something + * bad might have taken place. 
:-( */ if (jbd_get32(header, sequence) != this_trans_id) { if (action != ACTION_SCAN) r = EIO; @@ -384,22 +1078,66 @@ int jbd_iterate_log(struct jbd_fs *jbd_fs, switch (jbd_get32(header, blocktype)) { case JBD_DESCRIPTOR_BLOCK: - ext4_dbg(DEBUG_JBD, "Descriptor block: %u, " - "trans_id: %u\n", + if (!jbd_verify_meta_csum(jbd_fs, header)) { + ext4_dbg(DEBUG_JBD, + DBG_WARN "Descriptor block checksum failed." + "Journal block: %" PRIu32"\n", + this_block); + log_end = true; + break; + } + ext4_dbg(DEBUG_JBD, "Descriptor block: %" PRIu32", " + "trans_id: %" PRIu32"\n", this_block, this_trans_id); - jbd_debug_descriptor_block(jbd_fs, header, &this_block); + if (action == ACTION_RECOVER) { + struct replay_arg replay_arg; + replay_arg.info = info; + replay_arg.this_block = &this_block; + replay_arg.this_trans_id = this_trans_id; + + jbd_replay_descriptor_block(jbd_fs, + header, &replay_arg); + } else + jbd_debug_descriptor_block(jbd_fs, + header, &this_block); + break; case JBD_COMMIT_BLOCK: - ext4_dbg(DEBUG_JBD, "Commit block: %u, " - "trans_id: %u\n", + if (!jbd_verify_commit_csum(jbd_fs, + (struct jbd_commit_header *)header)) { + ext4_dbg(DEBUG_JBD, + DBG_WARN "Commit block checksum failed." + "Journal block: %" PRIu32"\n", + this_block); + log_end = true; + break; + } + ext4_dbg(DEBUG_JBD, "Commit block: %" PRIu32", " + "trans_id: %" PRIu32"\n", this_block, this_trans_id); + /* This is the end of a transaction, + * we may now proceed to the next transaction. + */ this_trans_id++; + info->trans_cnt++; break; case JBD_REVOKE_BLOCK: - ext4_dbg(DEBUG_JBD, "Revoke block: %u, " - "trans_id: %u\n", + if (!jbd_verify_meta_csum(jbd_fs, header)) { + ext4_dbg(DEBUG_JBD, + DBG_WARN "Revoke block checksum failed." 
+ "Journal block: %" PRIu32"\n", + this_block); + log_end = true; + break; + } + ext4_dbg(DEBUG_JBD, "Revoke block: %" PRIu32", " + "trans_id: %" PRIu32"\n", this_block, this_trans_id); - jbd_build_revoke_root(jbd_fs, header, info); + if (action == ACTION_REVOKE) { + info->this_trans_id = this_trans_id; + jbd_build_revoke_tree(jbd_fs, + header, info); + } break; default: log_end = true; @@ -414,6 +1152,7 @@ int jbd_iterate_log(struct jbd_fs *jbd_fs, } ext4_dbg(DEBUG_JBD, "End of journal.\n"); if (r == EOK && action == ACTION_SCAN) { + /* We have finished scanning the journal. */ info->start_trans_id = start_trans_id; if (this_trans_id > start_trans_id) info->last_trans_id = this_trans_id - 1; @@ -424,6 +1163,9 @@ int jbd_iterate_log(struct jbd_fs *jbd_fs, return r; } +/**@brief Replay journal. + * @param jbd_fs jbd filesystem + * @return standard error code*/ int jbd_recover(struct jbd_fs *jbd_fs) { int r; @@ -432,6 +1174,985 @@ int jbd_recover(struct jbd_fs *jbd_fs) if (!sb->start) return EOK; + RB_INIT(&info.revoke_root); + r = jbd_iterate_log(jbd_fs, &info, ACTION_SCAN); + if (r != EOK) + return r; + + r = jbd_iterate_log(jbd_fs, &info, ACTION_REVOKE); + if (r != EOK) + return r; + + r = jbd_iterate_log(jbd_fs, &info, ACTION_RECOVER); + if (r == EOK) { + /* If we successfully replay the journal, + * clear EXT4_FINCOM_RECOVER flag on the + * ext4 superblock, and set the start of + * journal to 0.*/ + uint32_t features_incompatible = + ext4_get32(&jbd_fs->inode_ref.fs->sb, + features_incompatible); + jbd_set32(&jbd_fs->sb, start, 0); + features_incompatible &= ~EXT4_FINCOM_RECOVER; + ext4_set32(&jbd_fs->inode_ref.fs->sb, + features_incompatible, + features_incompatible); + jbd_fs->dirty = true; + r = ext4_sb_write(jbd_fs->inode_ref.fs->bdev, + &jbd_fs->inode_ref.fs->sb); + } + jbd_destroy_revoke_tree(&info); + return r; +} + +static void jbd_journal_write_sb(struct jbd_journal *journal) +{ + struct jbd_fs *jbd_fs = journal->jbd_fs; + jbd_set32(&jbd_fs->sb, start, 
journal->start); + jbd_set32(&jbd_fs->sb, sequence, journal->trans_id); + jbd_fs->dirty = true; +} + +/**@brief Start accessing the journal. + * @param jbd_fs jbd filesystem + * @param journal current journal session + * @return standard error code*/ +int jbd_journal_start(struct jbd_fs *jbd_fs, + struct jbd_journal *journal) +{ + int r; + uint32_t features_incompatible = + ext4_get32(&jbd_fs->inode_ref.fs->sb, + features_incompatible); + struct ext4_block block = EXT4_BLOCK_ZERO(); + features_incompatible |= EXT4_FINCOM_RECOVER; + ext4_set32(&jbd_fs->inode_ref.fs->sb, + features_incompatible, + features_incompatible); + r = ext4_sb_write(jbd_fs->inode_ref.fs->bdev, + &jbd_fs->inode_ref.fs->sb); + if (r != EOK) + return r; + + journal->first = jbd_get32(&jbd_fs->sb, first); + journal->start = journal->first; + journal->last = journal->first; + journal->trans_id = 1; + journal->alloc_trans_id = 1; + + journal->block_size = jbd_get32(&jbd_fs->sb, blocksize); + + r = jbd_block_get_noread(jbd_fs, + &block, + journal->start); + if (r != EOK) { + memset(journal, 0, sizeof(struct jbd_journal)); + return r; + } + memset(block.data, 0, journal->block_size); + ext4_bcache_set_dirty(block.buf); + r = jbd_block_set(jbd_fs, &block); + if (r != EOK) { + memset(journal, 0, sizeof(struct jbd_journal)); + return r; + } + + TAILQ_INIT(&journal->trans_queue); + TAILQ_INIT(&journal->cp_queue); + RB_INIT(&journal->block_rec_root); + journal->jbd_fs = jbd_fs; + jbd_journal_write_sb(journal); + return jbd_write_sb(jbd_fs); +} + +static void jbd_trans_end_write(struct ext4_bcache *bc __unused, + struct ext4_buf *buf __unused, + int res, + void *arg); + +static void jbd_journal_flush_trans(struct jbd_trans *trans) +{ + struct jbd_buf *jbd_buf, *tmp; + struct jbd_journal *journal = trans->journal; + struct ext4_fs *fs = journal->jbd_fs->inode_ref.fs; + void *tmp_data = malloc(journal->block_size); + ext4_assert(tmp_data); + + TAILQ_FOREACH_SAFE(jbd_buf, &trans->buf_queue, buf_node, + tmp) { 
+ struct ext4_buf *buf = jbd_buf->block_rec->buf; + /* The buffer in memory is still dirty. */ + if (buf) { + if (jbd_buf->block_rec->trans != trans) { + int r; + struct ext4_block jbd_block = EXT4_BLOCK_ZERO(); + ext4_assert(ext4_block_get(fs->bdev, + &jbd_block, + jbd_buf->jbd_lba) == EOK); + memcpy(tmp_data, jbd_block.data, + journal->block_size); + ext4_block_set(fs->bdev, &jbd_block); + r = ext4_blocks_set_direct(fs->bdev, tmp_data, + buf->lba, 1); + jbd_trans_end_write(fs->bdev->bc, buf, r, jbd_buf); + } else + ext4_block_flush_buf(fs->bdev, buf); + + } + } + + free(tmp_data); +} + +static void +jbd_journal_skip_pure_revoke(struct jbd_journal *journal, + struct jbd_trans *trans) +{ + journal->start = trans->start_iblock + + trans->alloc_blocks; + wrap(&journal->jbd_fs->sb, journal->start); + journal->trans_id = trans->trans_id + 1; + jbd_journal_free_trans(journal, + trans, false); + jbd_journal_write_sb(journal); +} + +static void +jbd_journal_purge_cp_trans(struct jbd_journal *journal, + bool flush) +{ + struct jbd_trans *trans; + while ((trans = TAILQ_FIRST(&journal->cp_queue))) { + if (!trans->data_cnt) { + TAILQ_REMOVE(&journal->cp_queue, + trans, + trans_node); + jbd_journal_skip_pure_revoke(journal, trans); + } else { + if (trans->data_cnt == + trans->written_cnt) { + journal->start = + trans->start_iblock + + trans->alloc_blocks; + wrap(&journal->jbd_fs->sb, + journal->start); + journal->trans_id = + trans->trans_id + 1; + TAILQ_REMOVE(&journal->cp_queue, + trans, + trans_node); + jbd_journal_free_trans(journal, + trans, + false); + jbd_journal_write_sb(journal); + } else if (!flush) { + journal->start = + trans->start_iblock; + wrap(&journal->jbd_fs->sb, + journal->start); + journal->trans_id = + trans->trans_id; + jbd_journal_write_sb(journal); + break; + } else + jbd_journal_flush_trans(trans); + } + } +} + +/**@brief Stop accessing the journal. 
+ * @param journal current journal session
+ * @return standard error code*/
+int jbd_journal_stop(struct jbd_journal *journal)
+{
+	struct jbd_fs *jbd_fs = journal->jbd_fs;
+	struct ext4_fs *fs = jbd_fs->inode_ref.fs;
+	uint32_t fincom;
+	int rc;
+
+	/* Checkpoint everything that is still queued; with
+	 * flush == true, transactions that are not fully written
+	 * yet get flushed. */
+	jbd_journal_purge_cp_trans(journal, true);
+
+	/* Leftover block records are only reported here; the
+	 * shutdown sequence continues regardless. */
+	if (!RB_EMPTY(&journal->block_rec_root))
+		ext4_dbg(DEBUG_JBD,
+			 DBG_WARN "There are still block records "
+				  "in this journal session!\n");
+
+	/* Clear EXT4_FINCOM_RECOVER in the ext4 superblock and
+	 * write the superblock out. */
+	fincom = ext4_get32(&fs->sb, features_incompatible);
+	fincom &= ~EXT4_FINCOM_RECOVER;
+	ext4_set32(&fs->sb, features_incompatible, fincom);
+	rc = ext4_sb_write(fs->bdev, &fs->sb);
+	if (rc != EOK)
+		return rc;
+
+	/* Zero the start block and transaction id, copy them into
+	 * the journal superblock and pass it to jbd_write_sb(). */
+	journal->start = 0;
+	journal->trans_id = 0;
+	jbd_journal_write_sb(journal);
+	return jbd_write_sb(jbd_fs);
+}
+
+/**@brief Allocate a block in the journal.
+ * @param journal current journal session
+ * @param trans transaction
+ * @return allocated block address*/
+static uint32_t jbd_journal_alloc_block(struct jbd_journal *journal,
+					struct jbd_trans *trans)
+{
+	/* The block handed out is the current value of
+	 * journal->last. */
+	uint32_t allocated = journal->last;
+
+	journal->last++;
+	trans->alloc_blocks++;
+	wrap(&journal->jbd_fs->sb, journal->last);
+
+	/* When last catches up with start there is no space
+	 * left: flush all journalled blocks to disk first. */
+	if (journal->last == journal->start)
+		jbd_journal_purge_cp_trans(journal, true);
+
+	return allocated;
+}
+
+/**@brief Allocate a new transaction
+ * @param journal current journal session
+ * @return transaction allocated, or NULL if allocation fails*/
+struct jbd_trans *
+jbd_journal_new_trans(struct jbd_journal *journal)
+{
+	struct jbd_trans *new_trans = calloc(1, sizeof(*new_trans));
+
+	if (new_trans == NULL)
+		return NULL;
+
+	/* No trans_id is assigned here; it is set when the
+	 * transaction gets committed. */
+	TAILQ_INIT(&new_trans->buf_queue);
+	new_trans->error = EOK;
+	new_trans->data_csum = EXT4_CRC32_INIT;
+	new_trans->journal = journal;
+	return new_trans;
+}
+
+/**@brief Gain access to a block before making any modifications.
+ * @param journal current journal session
+ * @param trans transaction
+ * @param block block descriptor
+ * @return standard error code.*/
+int jbd_trans_get_access(struct jbd_journal *journal,
+			 struct jbd_trans *trans,
+			 struct ext4_block *block)
+{
+	int r = EOK;
+	struct ext4_fs *fs = journal->jbd_fs->inode_ref.fs;
+	struct jbd_buf *jbd_buf = block->buf->end_write_arg;
+
+	/* If the buffer is dirty and tracked by the journal
+	 * (its end_write hook is jbd_trans_end_write) on behalf of
+	 * a different transaction, flush it to disk first.*/
+	if (ext4_bcache_test_flag(block->buf, BC_DIRTY) &&
+	    block->buf->end_write == jbd_trans_end_write) {
+		ext4_assert(jbd_buf);
+		if (jbd_buf->trans != trans)
+			r = ext4_block_flush_buf(fs->bdev, block->buf);
+
+	}
 	return r;
 }
+
+/**@brief Look up the block record of @lba in the journal's
+ *        block record tree.
+ * @param journal current journal session
+ * @param lba logical block address
+ * @return the record returned by RB_FIND for @lba*/
+static struct jbd_block_rec *
+jbd_trans_block_rec_lookup(struct jbd_journal *journal,
+			   ext4_fsblk_t lba)
+{
+	/* Only the lba field of the search key is filled in. */
+	struct jbd_block_rec tmp = {
+		.lba = lba
+	};
+
+	return RB_FIND(jbd_block,
+		       &journal->block_rec_root,
+		       &tmp);
+}
+
+/**@brief Move a block record to another transaction.
+ * @param block_rec block record
+ * @param new_trans transaction that takes over the record
+ * @param new_buf buffer to associate with the record*/
+static void
+jbd_trans_change_ownership(struct jbd_block_rec *block_rec,
+			   struct jbd_trans *new_trans,
+			   struct ext4_buf *new_buf)
+{
+	LIST_REMOVE(block_rec, tbrec_node);
+	/* Now this block record belongs to this transaction. */
+	LIST_INSERT_HEAD(&new_trans->tbrec_list, block_rec, tbrec_node);
+	block_rec->trans = new_trans;
+	block_rec->buf = new_buf;
+}
+
+/**@brief Make @trans the owner of the block record for @lba,
+ *        creating the record if none exists yet.
+ * @param trans transaction
+ * @param lba logical block address
+ * @param buf buffer to associate with the record
+ * @return block record, or NULL if allocation fails*/
+static inline struct jbd_block_rec *
+jbd_trans_insert_block_rec(struct jbd_trans *trans,
+			   ext4_fsblk_t lba,
+			   struct ext4_buf *buf)
+{
+	struct jbd_block_rec *block_rec;
+	block_rec = jbd_trans_block_rec_lookup(trans->journal, lba);
+	if (block_rec) {
+		/* An existing record is re-homed to this transaction. */
+		jbd_trans_change_ownership(block_rec, trans, buf);
+		return block_rec;
+	}
+	block_rec = calloc(1, sizeof(struct jbd_block_rec));
+	if (!block_rec)
+		return NULL;
+
+	block_rec->lba = lba;
+	block_rec->buf = buf;
+	block_rec->trans = trans;
+	TAILQ_INIT(&block_rec->dirty_buf_queue);
+	LIST_INSERT_HEAD(&trans->tbrec_list, block_rec, tbrec_node);
+	RB_INSERT(jbd_block, &trans->journal->block_rec_root, block_rec);
+	return block_rec;
+}
+
+/**@brief Settle a block record when a buffer of its owning
+ *        transaction is written or dropped.
+ *
+ * Does nothing unless @trans owns @block_rec.
+ * Without @abort, jbd_trans_end_write() is called for every
+ * jbd_buf still on the record's dirty queue.
+ * With @abort, the journalled copy belonging to the last queued
+ * jbd_buf is copied back into the cache block of the record, and
+ * the record is handed to that jbd_buf's transaction.
+ * @param journal current journal session
+ * @param trans transaction being finished
+ * @param block_rec block record
+ * @param abort true when the modification is being discarded*/
+static void
+jbd_trans_finish_callback(struct jbd_journal *journal,
+			  const struct jbd_trans *trans,
+			  struct jbd_block_rec *block_rec,
+			  bool abort)
+{
+	struct ext4_fs *fs = journal->jbd_fs->inode_ref.fs;
+	if (block_rec->trans != trans)
+		return;
+
+	if (!abort) {
+		struct jbd_buf *jbd_buf, *tmp;
+		TAILQ_FOREACH_SAFE(jbd_buf,
+				   &block_rec->dirty_buf_queue,
+				   dirty_buf_node,
+				   tmp) {
+			/* All we need is a fake ext4_buf. */
+			struct ext4_buf buf;
+
+			jbd_trans_end_write(fs->bdev->bc,
+					    &buf,
+					    EOK,
+					    jbd_buf);
+		}
+	} else {
+		struct jbd_buf *jbd_buf;
+		struct ext4_block jbd_block = EXT4_BLOCK_ZERO(),
+				  block = EXT4_BLOCK_ZERO();
+		jbd_buf = TAILQ_LAST(&block_rec->dirty_buf_queue,
+				     jbd_buf_dirty);
+		if (jbd_buf) {
+			int r;
+
+			/* The block loads are kept out of the assertion
+			 * arguments so that they still run if
+			 * ext4_assert() is compiled out. */
+			r = ext4_block_get(fs->bdev,
+					   &jbd_block,
+					   jbd_buf->jbd_lba);
+			ext4_assert(r == EOK);
+			r = ext4_block_get_noread(fs->bdev,
+						  &block,
+						  block_rec->lba);
+			ext4_assert(r == EOK);
+			(void)r;
+
+			memcpy(block.data, jbd_block.data,
+			       journal->block_size);
+
+			jbd_trans_change_ownership(block_rec,
+						   jbd_buf->trans, block.buf);
+
+			block.buf->end_write = jbd_trans_end_write;
+			block.buf->end_write_arg = jbd_buf;
+
+			ext4_bcache_set_flag(jbd_block.buf, BC_TMP);
+			ext4_bcache_set_dirty(block.buf);
+
+			ext4_block_set(fs->bdev, &jbd_block);
+			ext4_block_set(fs->bdev, &block);
+			return;
+		}
+	}
+}
+
+/**@brief Unlink and free a block record owned by @trans.
+ * @param journal current journal session
+ * @param block_rec block record
+ * @param trans transaction*/
+static inline void
+jbd_trans_remove_block_rec(struct jbd_journal *journal,
+			   struct jbd_block_rec *block_rec,
+			   struct jbd_trans *trans)
+{
+	/* If this block record doesn't belong to this transaction,
+	 * give up.*/
+	if (block_rec->trans == trans) {
+		LIST_REMOVE(block_rec, tbrec_node);
+		RB_REMOVE(jbd_block,
+			  &journal->block_rec_root,
+			  block_rec);
+		free(block_rec);
+	}
+}
+
+/**@brief Add block to a transaction and mark it dirty.
+ * @param trans transaction
+ * @param block block descriptor
+ * @return standard error code*/
+int jbd_trans_set_block_dirty(struct jbd_trans *trans,
+			      struct ext4_block *block)
+{
+	struct ext4_buf *cache_buf = block->buf;
+	struct jbd_block_rec *rec;
+	struct jbd_buf *jbuf;
+
+	/* Nothing to do when the buffer is already journalled
+	 * on behalf of this very transaction. */
+	if (cache_buf->end_write == jbd_trans_end_write) {
+		struct jbd_buf *owner = cache_buf->end_write_arg;
+
+		if (owner != NULL && owner->trans == trans)
+			return EOK;
+	}
+
+	jbuf = calloc(1, sizeof(*jbuf));
+	if (jbuf == NULL)
+		return ENOMEM;
+
+	rec = jbd_trans_insert_block_rec(trans, block->lb_id, cache_buf);
+	if (rec == NULL) {
+		free(jbuf);
+		return ENOMEM;
+	}
+
+	TAILQ_INSERT_TAIL(&rec->dirty_buf_queue, jbuf, dirty_buf_node);
+
+	/* The jbd_buf stores its own copy of the block descriptor
+	 * and takes an extra reference on the cache buffer. */
+	jbuf->block_rec = rec;
+	jbuf->trans = trans;
+	jbuf->block = *block;
+	ext4_bcache_inc_ref(cache_buf);
+
+	/* Hook the end-of-write callback so that we are notified
+	 * once the content reaches the disk and may checkpoint. */
+	cache_buf->end_write = jbd_trans_end_write;
+	cache_buf->end_write_arg = jbuf;
+
+	trans->data_cnt++;
+	TAILQ_INSERT_HEAD(&trans->buf_queue, jbuf, buf_node);
+
+	ext4_bcache_set_dirty(cache_buf);
+	return EOK;
+}
+
+/**@brief Add block to be revoked to a transaction
+ * @param trans transaction
+ * @param lba logical block address
+ * @return EOK, or ENOMEM if the revoke record cannot be allocated*/
+int jbd_trans_revoke_block(struct jbd_trans *trans,
+			   ext4_fsblk_t lba)
+{
+	struct jbd_revoke_rec *revoke = calloc(1, sizeof(*revoke));
+
+	if (revoke == NULL)
+		return ENOMEM;
+
+	revoke->lba = lba;
+	LIST_INSERT_HEAD(&trans->revoke_list, revoke, revoke_node);
+	return EOK;
+}
+
+/**@brief Try to add block to be revoked to a transaction.
+ * If @lba still remains in a transaction on checkpoint
+ * queue, add @lba as a revoked block to the transaction.
+ * @param trans transaction
+ * @param lba logical block address
+ * @return EOK when nothing had to be revoked or the revoke was
+ *         recorded; otherwise the error from flushing the buffer
+ *         or from jbd_trans_revoke_block()*/
+int jbd_trans_try_revoke_block(struct jbd_trans *trans,
+			       ext4_fsblk_t lba)
+{
+	int r = EOK;
+	struct jbd_journal *journal = trans->journal;
+	struct ext4_fs *fs = journal->jbd_fs->inode_ref.fs;
+	struct jbd_block_rec *block_rec =
+		jbd_trans_block_rec_lookup(journal, lba);
+
+	/* Make sure we don't flush any buffers that belong to this
+	 * transaction. */
+	if (block_rec && block_rec->trans != trans) {
+		/* If the buffer has not been flushed yet, flush it now. */
+		if (block_rec->buf) {
+			r = ext4_block_flush_buf(fs->bdev, block_rec->buf);
+			if (r != EOK)
+				return r;
+
+		}
+
+		/* Report a failed revoke (ENOMEM) to the caller
+		 * instead of silently dropping it. */
+		r = jbd_trans_revoke_block(trans, lba);
+	}
+
+	return r;
+}
+
+/**@brief Free a transaction
+ * @param journal current journal session
+ * @param trans transaction
+ * @param abort discard all the modifications on the block?*/
+void jbd_journal_free_trans(struct jbd_journal *journal,
+			    struct jbd_trans *trans,
+			    bool abort)
+{
+	struct jbd_buf *jbd_buf, *tmp;
+	struct jbd_revoke_rec *rec, *tmp2;
+	struct jbd_block_rec *block_rec, *tmp3;
+	struct ext4_fs *fs = journal->jbd_fs->inode_ref.fs;
+	TAILQ_FOREACH_SAFE(jbd_buf, &trans->buf_queue, buf_node,
+			  tmp) {
+		block_rec = jbd_buf->block_rec;
+		if (abort) {
+			/* Detach the buffer from the journal, drop its
+			 * dirty state and release the block. */
+			jbd_buf->block.buf->end_write = NULL;
+			jbd_buf->block.buf->end_write_arg = NULL;
+			ext4_bcache_clear_dirty(jbd_buf->block.buf);
+			ext4_block_set(fs->bdev, &jbd_buf->block);
+		}
+
+		TAILQ_REMOVE(&jbd_buf->block_rec->dirty_buf_queue,
+			     jbd_buf,
+			     dirty_buf_node);
+		jbd_trans_finish_callback(journal,
+				trans,
+				block_rec,
+				abort);
+		TAILQ_REMOVE(&trans->buf_queue, jbd_buf, buf_node);
+		free(jbd_buf);
+	}
+	LIST_FOREACH_SAFE(rec, &trans->revoke_list, revoke_node,
+			  tmp2) {
+		LIST_REMOVE(rec, revoke_node);
+		free(rec);
+	}
+	/* Only records still owned by this transaction are freed. */
+	LIST_FOREACH_SAFE(block_rec, &trans->tbrec_list, tbrec_node,
+			  tmp3) {
+		jbd_trans_remove_block_rec(journal, block_rec, trans);
+	}
+
+	free(trans);
+}
+
+/**@brief Write commit block for a transaction
+ * @param trans transaction
+ * @return standard error code*/
+static int jbd_trans_write_commit_block(struct jbd_trans *trans)
+{
+	int rc;
+	struct jbd_commit_header *header;
+	uint32_t commit_iblock = 0;
+	struct ext4_block commit_block;
+	struct jbd_journal *journal = trans->journal;
+
+	commit_iblock = jbd_journal_alloc_block(journal, trans);
+	rc = jbd_block_get_noread(journal->jbd_fs,
+			&commit_block, commit_iblock);
+	if (rc != EOK)
+		return rc;
+
+	header = (struct jbd_commit_header *)commit_block.data;
+	jbd_set32(&header->header, magic, JBD_MAGIC_NUMBER);
+	jbd_set32(&header->header, blocktype, JBD_COMMIT_BLOCK);
+	jbd_set32(&header->header, sequence, trans->trans_id);
+
+	/* NOTE(review): a *_COMPAT_* flag is tested with the
+	 * INCOMPAT feature macro here - confirm which feature
+	 * word JBD_FEATURE_COMPAT_CHECKSUM belongs to. Also
+	 * confirm that chksum_type/chksum_size are 32-bit
+	 * fields, since they are stored with jbd_set32(). */
+	if (JBD_HAS_INCOMPAT_FEATURE(&journal->jbd_fs->sb,
+				JBD_FEATURE_COMPAT_CHECKSUM)) {
+		jbd_set32(header, chksum_type, JBD_CRC32_CHKSUM);
+		jbd_set32(header, chksum_size, JBD_CRC32_CHKSUM_SIZE);
+		jbd_set32(header, chksum[0], trans->data_csum);
+	}
+	jbd_commit_csum_set(journal->jbd_fs, header);
+	ext4_bcache_set_dirty(commit_block.buf);
+	return jbd_block_set(journal->jbd_fs, &commit_block);
+}
+
+/**@brief Write descriptor block for a transaction
+ * @param journal current journal session
+ * @param trans transaction
+ * @return standard error code*/
+static int jbd_journal_prepare(struct jbd_journal *journal,
+			       struct jbd_trans *trans)
+{
+	int rc = EOK, i = 0;
+	int32_t tag_tbl_size;
+	uint32_t desc_iblock = 0;
+	uint32_t data_iblock = 0;
+	char *tag_start = NULL, *tag_ptr = NULL;
+	struct jbd_buf *jbd_buf, *tmp;
+	struct ext4_block desc_block, data_block;
+	struct ext4_fs *fs = journal->jbd_fs->inode_ref.fs;
+	uint32_t checksum = EXT4_CRC32_INIT;
+
+	/* Try to remove any non-dirty buffers from the tail of
+	 * buf_queue. */
+	TAILQ_FOREACH_REVERSE_SAFE(jbd_buf, &trans->buf_queue,
+			jbd_trans_buf, buf_node, tmp) {
+		/* We stop the iteration when we find a dirty buffer. */
+		if (ext4_bcache_test_flag(jbd_buf->block.buf,
+					BC_DIRTY))
+			break;
+
+		TAILQ_REMOVE(&jbd_buf->block_rec->dirty_buf_queue,
+			jbd_buf,
+			dirty_buf_node);
+
+		jbd_buf->block.buf->end_write = NULL;
+		jbd_buf->block.buf->end_write_arg = NULL;
+		jbd_trans_finish_callback(journal,
+				trans,
+				jbd_buf->block_rec,
+				true);
+
+		/* The buffer has not been modified, just release
+		 * that jbd_buf. */
+		jbd_trans_remove_block_rec(journal,
+				jbd_buf->block_rec, trans);
+		trans->data_cnt--;
+
+		ext4_block_set(fs->bdev, &jbd_buf->block);
+		TAILQ_REMOVE(&trans->buf_queue, jbd_buf, buf_node);
+		free(jbd_buf);
+	}
+
+	TAILQ_FOREACH_SAFE(jbd_buf, &trans->buf_queue, buf_node, tmp) {
+		struct tag_info tag_info;
+		bool uuid_exist = false;
+		if (!ext4_bcache_test_flag(jbd_buf->block.buf,
+					   BC_DIRTY)) {
+			TAILQ_REMOVE(&jbd_buf->block_rec->dirty_buf_queue,
+				jbd_buf,
+				dirty_buf_node);
+
+			jbd_buf->block.buf->end_write = NULL;
+			jbd_buf->block.buf->end_write_arg = NULL;
+			jbd_trans_finish_callback(journal,
+					trans,
+					jbd_buf->block_rec,
+					true);
+
+			/* The buffer has not been modified, just release
+			 * that jbd_buf. */
+			jbd_trans_remove_block_rec(journal,
+					jbd_buf->block_rec, trans);
+			trans->data_cnt--;
+
+			ext4_block_set(fs->bdev, &jbd_buf->block);
+			TAILQ_REMOVE(&trans->buf_queue, jbd_buf, buf_node);
+			free(jbd_buf);
+			continue;
+		}
+		/* The running checksum covers every data block
+		 * written so far in this transaction. */
+		checksum = jbd_block_csum(journal->jbd_fs,
+					  jbd_buf->block.data,
+					  checksum,
+					  trans->trans_id);
+again:
+		if (!desc_iblock) {
+			/* Start a new descriptor block. */
+			struct jbd_bhdr *bhdr;
+			desc_iblock = jbd_journal_alloc_block(journal, trans);
+			rc = jbd_block_get_noread(journal->jbd_fs,
+					   &desc_block, desc_iblock);
+			if (rc != EOK)
+				break;
+
+			ext4_bcache_set_dirty(desc_block.buf);
+
+			bhdr = (struct jbd_bhdr *)desc_block.data;
+			jbd_set32(bhdr, magic, JBD_MAGIC_NUMBER);
+			jbd_set32(bhdr, blocktype, JBD_DESCRIPTOR_BLOCK);
+			jbd_set32(bhdr, sequence, trans->trans_id);
+
+			tag_start = (char *)(bhdr + 1);
+			tag_ptr = tag_start;
+			/* The first tag of a descriptor block carries
+			 * the UUID. */
+			uuid_exist = true;
+			tag_tbl_size = journal->block_size -
+				sizeof(struct jbd_bhdr);
+
+			if (jbd_has_csum(&journal->jbd_fs->sb))
+				tag_tbl_size -= sizeof(struct jbd_block_tail);
+
+			if (!trans->start_iblock)
+				trans->start_iblock = desc_iblock;
+
+		}
+		tag_info.block = jbd_buf->block.lb_id;
+		tag_info.uuid_exist = uuid_exist;
+		if (i == trans->data_cnt - 1)
+			tag_info.last_tag = true;
+		else
+			tag_info.last_tag = false;
+
+		tag_info.checksum = checksum;
+
+		if (uuid_exist)
+			memcpy(tag_info.uuid, journal->jbd_fs->sb.uuid,
+					UUID_SIZE);
+
+		rc = jbd_write_block_tag(journal->jbd_fs,
+				tag_ptr,
+				tag_tbl_size,
+				&tag_info);
+		if (rc != EOK) {
+			/* Any failure is handled as "descriptor block
+			 * full": close this one and retry the tag in a
+			 * fresh descriptor block. */
+			jbd_meta_csum_set(journal->jbd_fs,
+					(struct jbd_bhdr *)desc_block.data);
+			jbd_block_set(journal->jbd_fs, &desc_block);
+			desc_iblock = 0;
+			goto again;
+		}
+
+		/* NOTE(review): the break paths below leave the loop
+		 * without calling jbd_block_set() on desc_block -
+		 * confirm whether that leaks a block reference. */
+		data_iblock = jbd_journal_alloc_block(journal, trans);
+		rc = jbd_block_get_noread(journal->jbd_fs,
+				&data_block, data_iblock);
+		if (rc != EOK)
+			break;
+
+		ext4_bcache_set_dirty(data_block.buf);
+
+		memcpy(data_block.data, jbd_buf->block.data,
+			journal->block_size);
+		jbd_buf->jbd_lba = data_block.lb_id;
+
+		rc = jbd_block_set(journal->jbd_fs, &data_block);
+		if (rc != EOK)
+			break;
+
+		tag_ptr += tag_info.tag_bytes;
+		tag_tbl_size -= tag_info.tag_bytes;
+
+		i++;
+	}
+	if (rc == EOK && desc_iblock) {
+		jbd_meta_csum_set(journal->jbd_fs,
+				(struct jbd_bhdr *)desc_block.data);
+		trans->data_csum = checksum;
+		jbd_block_set(journal->jbd_fs, &desc_block);
+	}
+
+	return rc;
+}
+
+/**@brief Write revoke block for a transaction
+ * @param journal current journal session
+ * @param trans transaction
+ * @return standard error code*/
+static int
+jbd_journal_prepare_revoke(struct jbd_journal *journal,
+			   struct jbd_trans *trans)
+{
+	int rc = EOK, i = 0;
+	int32_t tag_tbl_size;
+	uint32_t desc_iblock = 0;
+	char *blocks_entry = NULL;
+	struct jbd_revoke_rec *rec, *tmp;
+	struct ext4_block desc_block;
+	struct jbd_revoke_header *header = NULL;
+	/* Each revoke record is 4 bytes, or 8 with the 64BIT
+	 * feature. */
+	int32_t record_len = 4;
+
+	if (JBD_HAS_INCOMPAT_FEATURE(&journal->jbd_fs->sb,
+				     JBD_FEATURE_INCOMPAT_64BIT))
+		record_len = 8;
+
+	LIST_FOREACH_SAFE(rec, &trans->revoke_list, revoke_node,
+			  tmp) {
+again:
+		if (!desc_iblock) {
+			/* Start a new revoke block. */
+			struct jbd_bhdr *bhdr;
+			desc_iblock = jbd_journal_alloc_block(journal, trans);
+			rc = jbd_block_get_noread(journal->jbd_fs,
+					   &desc_block, desc_iblock);
+			if (rc != EOK) {
+				break;
+			}
+
+			ext4_bcache_set_dirty(desc_block.buf);
+
+			bhdr = (struct jbd_bhdr *)desc_block.data;
+			jbd_set32(bhdr, magic, JBD_MAGIC_NUMBER);
+			jbd_set32(bhdr, blocktype, JBD_REVOKE_BLOCK);
+			jbd_set32(bhdr, sequence, trans->trans_id);
+
+			header = (struct jbd_revoke_header *)bhdr;
+			blocks_entry = (char *)(header + 1);
+			tag_tbl_size = journal->block_size -
+				sizeof(struct jbd_revoke_header);
+
+			if (jbd_has_csum(&journal->jbd_fs->sb))
+				tag_tbl_size -= sizeof(struct jbd_block_tail);
+
+			if (!trans->start_iblock)
+				trans->start_iblock = desc_iblock;
+
+		}
+
+		if (tag_tbl_size < record_len) {
+			/* No room for another record: close this revoke
+			 * block and retry the record in a fresh one.
+			 * NOTE(review): when jbd_has_csum() is true,
+			 * block_size - tag_tbl_size also includes
+			 * sizeof(struct jbd_block_tail) - confirm that
+			 * the reader of `count` expects that. */
+			jbd_set32(header, count,
+				  journal->block_size - tag_tbl_size);
+			jbd_meta_csum_set(journal->jbd_fs,
+					(struct jbd_bhdr *)desc_block.data);
+			jbd_block_set(journal->jbd_fs, &desc_block);
+			desc_iblock = 0;
+			header = NULL;
+			goto again;
+		}
+		if (record_len == 8) {
+			uint64_t *blocks =
+				(uint64_t *)blocks_entry;
+			*blocks = to_be64(rec->lba);
+		} else {
+			uint32_t *blocks =
+				(uint32_t *)blocks_entry;
+			*blocks = to_be32(rec->lba);
+		}
+		blocks_entry += record_len;
+		tag_tbl_size -= record_len;
+
+		i++;
+	}
+	if (rc == EOK && desc_iblock) {
+		if (header != NULL)
+			jbd_set32(header, count,
+				  journal->block_size - tag_tbl_size);
+
+		jbd_meta_csum_set(journal->jbd_fs,
+				(struct jbd_bhdr *)desc_block.data);
+		jbd_block_set(journal->jbd_fs, &desc_block);
+	}
+
+	return rc;
+}
+
+/**@brief Put references of block descriptors in a transaction.
+ * @param journal current journal session
+ * @param trans transaction*/
+void jbd_journal_cp_trans(struct jbd_journal *journal, struct jbd_trans *trans)
+{
+	struct jbd_buf *jbd_buf, *tmp;
+	struct ext4_fs *fs = journal->jbd_fs->inode_ref.fs;
+	TAILQ_FOREACH_SAFE(jbd_buf, &trans->buf_queue, buf_node,
+			tmp) {
+		/* ext4_block_set() is given a local copy of the
+		 * descriptor, not the one stored in jbd_buf. */
+		struct ext4_block block = jbd_buf->block;
+		ext4_block_set(fs->bdev, &block);
+	}
+}
+
+/**@brief Update the start block of the journal when
+ * all the contents in a transaction reach the disk.
+ * @param bc block cache (unused)
+ * @param buf buffer that has been written
+ * @param res result of the write
+ * @param arg the jbd_buf registered as end_write_arg*/
+static void jbd_trans_end_write(struct ext4_bcache *bc __unused,
+			  struct ext4_buf *buf,
+			  int res,
+			  void *arg)
+{
+	struct jbd_buf *jbd_buf = arg;
+	struct jbd_trans *trans = jbd_buf->trans;
+	struct jbd_block_rec *block_rec = jbd_buf->block_rec;
+	struct jbd_journal *journal = trans->journal;
+	/* Sampled before the queues are modified below. */
+	bool first_in_queue =
+		trans == TAILQ_FIRST(&journal->cp_queue);
+	if (res != EOK)
+		trans->error = res;
+
+	TAILQ_REMOVE(&trans->buf_queue, jbd_buf, buf_node);
+	TAILQ_REMOVE(&block_rec->dirty_buf_queue,
+			jbd_buf,
+			dirty_buf_node);
+
+	jbd_trans_finish_callback(journal,
+			trans,
+			jbd_buf->block_rec,
+			false);
+	if (block_rec->trans == trans) {
+		block_rec->buf = NULL;
+		/* Clear the end_write and end_write_arg fields. */
+		buf->end_write = NULL;
+		buf->end_write_arg = NULL;
+	}
+
+	free(jbd_buf);
+
+	trans->written_cnt++;
+	if (trans->written_cnt == trans->data_cnt) {
+		/* If it is the first transaction on checkpoint queue,
+		 * we will shift the start of the journal to the next
+		 * transaction, and remove subsequent written
+		 * transactions from checkpoint queue until we find
+		 * an unwritten one. */
+		if (first_in_queue) {
+			journal->start = trans->start_iblock +
+				trans->alloc_blocks;
+			wrap(&journal->jbd_fs->sb, journal->start);
+			journal->trans_id = trans->trans_id + 1;
+			TAILQ_REMOVE(&journal->cp_queue, trans, trans_node);
+			jbd_journal_free_trans(journal, trans, false);
+
+			jbd_journal_purge_cp_trans(journal, false);
+			jbd_journal_write_sb(journal);
+			jbd_write_sb(journal->jbd_fs);
+		}
+	}
+}
+
+/**@brief Commit a transaction to the journal immediately.
+ *
+ * On failure journal->last is restored and @trans is freed
+ * with abort set; on every success path @trans is either
+ * queued on the checkpoint queue or freed.
+ * @param journal current journal session
+ * @param trans transaction
+ * @return standard error code*/
+int jbd_journal_commit_trans(struct jbd_journal *journal,
+			     struct jbd_trans *trans)
+{
+	int rc = EOK;
+	/* Remembered so that the blocks allocated by a failed
+	 * commit can be given back. */
+	uint32_t last = journal->last;
+
+	trans->trans_id = journal->alloc_trans_id;
+	rc = jbd_journal_prepare(journal, trans);
+	if (rc != EOK)
+		goto Finish;
+
+	rc = jbd_journal_prepare_revoke(journal, trans);
+	if (rc != EOK)
+		goto Finish;
+
+	if (TAILQ_EMPTY(&trans->buf_queue) &&
+	    LIST_EMPTY(&trans->revoke_list)) {
+		/* Since there are no entries in both buffer list
+		 * and revoke entry list, we do not consider trans as
+		 * complete transaction and just return EOK.*/
+		jbd_journal_free_trans(journal, trans, false);
+		goto Finish;
+	}
+
+	rc = jbd_trans_write_commit_block(trans);
+	if (rc != EOK)
+		goto Finish;
+
+	journal->alloc_trans_id++;
+	if (TAILQ_EMPTY(&journal->cp_queue)) {
+		if (trans->data_cnt) {
+			/* First transaction to checkpoint: the journal
+			 * starts at this transaction. */
+			journal->start = trans->start_iblock;
+			wrap(&journal->jbd_fs->sb, journal->start);
+			journal->trans_id = trans->trans_id;
+			jbd_journal_write_sb(journal);
+			jbd_write_sb(journal->jbd_fs);
+			TAILQ_INSERT_TAIL(&journal->cp_queue, trans,
+					trans_node);
+			jbd_journal_cp_trans(journal, trans);
+		} else {
+			/* Revoke-only transaction: nothing to
+			 * checkpoint, skip past it. */
+			journal->start = trans->start_iblock +
+				trans->alloc_blocks;
+			wrap(&journal->jbd_fs->sb, journal->start);
+			journal->trans_id = trans->trans_id + 1;
+			jbd_journal_write_sb(journal);
+			jbd_journal_free_trans(journal, trans, false);
+		}
+	} else {
+		TAILQ_INSERT_TAIL(&journal->cp_queue, trans,
+				trans_node);
+		if (trans->data_cnt)
+			jbd_journal_cp_trans(journal, trans);
+
+	}
+Finish:
+	if (rc != EOK) {
+		journal->last = last;
+		jbd_journal_free_trans(journal, trans, true);
+	}
+	return rc;
+}
+
+/**
+ * @}
+ */