ext4_journal: code logic optimization. See below.
[lwext4.git] / lwext4 / ext4_journal.c
index 033f3a49fd5d6fda28b86881279d75433af438e9..86366a1bf044a5792dea5395908bc37a6c7f104c 100644 (file)
@@ -42,9 +42,8 @@
 #include "ext4_journal.h"
 #include "ext4_errno.h"
 #include "ext4_blockdev.h"
-#include "ext4_crc32c.h"
+#include "ext4_crc32.h"
 #include "ext4_debug.h"
-#include "tree.h"
 
 #include <string.h>
 #include <stdlib.h>
@@ -75,6 +74,9 @@ struct recover_info {
        /**@brief  Used as internal argument.*/
        uint32_t this_trans_id;
 
+       /**@brief  Number of transactions gone through.*/
+       uint32_t trans_cnt;
+
        /**@brief  RB-Tree storing revoke entries.*/
        RB_HEAD(jbd_revoke, revoke_entry) revoke_root;
 };
@@ -119,6 +121,224 @@ RB_GENERATE_INTERNAL(jbd_block, jbd_block_rec, block_rec_node,
 #define jbd_alloc_revoke_entry() calloc(1, sizeof(struct revoke_entry))
 #define jbd_free_revoke_entry(addr) free(addr)
 
+static int jbd_has_csum(struct jbd_sb *jbd_sb)
+{
+       /* Report the journal checksum version in use: 2, 3, or 0 for none. */
+       int version = 0;
+       if (JBD_HAS_INCOMPAT_FEATURE(jbd_sb, JBD_FEATURE_INCOMPAT_CSUM_V2))
+               version = 2;
+       else if (JBD_HAS_INCOMPAT_FEATURE(jbd_sb, JBD_FEATURE_INCOMPAT_CSUM_V3))
+               version = 3;
+       return version;
+}
+
+#if CONFIG_META_CSUM_ENABLE
+static uint32_t jbd_sb_csum(struct jbd_sb *jbd_sb)
+{
+       uint32_t checksum = 0;
+       /* Stays 0 unless a v2/v3 checksum feature is enabled. */
+       if (jbd_has_csum(jbd_sb)) {
+               uint32_t orig_checksum = jbd_sb->checksum;
+               jbd_set32(jbd_sb, checksum, 0);
+               /* crc32c over the whole superblock, checksum field zeroed */
+               checksum = ext4_crc32c(EXT4_CRC32_INIT, jbd_sb,
+                               JBD_SUPERBLOCK_SIZE);
+               jbd_sb->checksum = orig_checksum;
+       }
+       return checksum;
+}
+#else
+#define jbd_sb_csum(...) 0
+#endif
+
+static void jbd_sb_csum_set(struct jbd_sb *jbd_sb)
+{
+       /* Journals without a v2/v3 checksum feature keep the field as is;
+        * otherwise store the freshly computed superblock checksum. */
+       if (jbd_has_csum(jbd_sb))
+               jbd_set32(jbd_sb, checksum, jbd_sb_csum(jbd_sb));
+}
+
+#if CONFIG_META_CSUM_ENABLE
+static bool
+jbd_verify_sb_csum(struct jbd_sb *jbd_sb)
+{
+       /* Nothing to check unless a v2/v3 checksum feature is enabled. */
+       bool has_csum = jbd_has_csum(jbd_sb) != 0;
+       return !has_csum ||
+              jbd_sb_csum(jbd_sb) == jbd_get32(jbd_sb, checksum);
+}
+#else
+#define jbd_verify_sb_csum(...) true
+#endif
+
+#if CONFIG_META_CSUM_ENABLE
+static uint32_t jbd_meta_csum(struct jbd_fs *jbd_fs,
+                             struct jbd_bhdr *bhdr)
+{
+       uint32_t checksum = 0;
+       /* Stays 0 unless a v2/v3 checksum feature is enabled. */
+       if (jbd_has_csum(&jbd_fs->sb)) {
+               uint32_t block_size = jbd_get32(&jbd_fs->sb, blocksize);
+               struct jbd_block_tail *tail =
+                       (struct jbd_block_tail *)((char *)bhdr + block_size -
+                               sizeof(struct jbd_block_tail));
+               uint32_t orig_checksum = tail->checksum;
+               tail->checksum = 0;
+               /* The tail checksum is zeroed while hashing, restored after. */
+               /* First calculate crc32c checksum against the journal uuid */
+               checksum = ext4_crc32c(EXT4_CRC32_INIT, jbd_fs->sb.uuid,
+                                      sizeof(jbd_fs->sb.uuid));
+               /* Then continue the crc32c over the whole block */
+               checksum = ext4_crc32c(checksum, bhdr,
+                               block_size);
+               tail->checksum = orig_checksum;
+       }
+       return checksum;
+}
+#else
+#define jbd_meta_csum(...) 0
+#endif
+
+static void jbd_meta_csum_set(struct jbd_fs *jbd_fs,
+                             struct jbd_bhdr *bhdr)
+{
+       uint32_t block_size;
+       struct jbd_block_tail *tail;
+       if (!jbd_has_csum(&jbd_fs->sb))
+               return;
+       /* The tail record occupies the last bytes of the block. */
+       block_size = jbd_get32(&jbd_fs->sb, blocksize);
+       tail = (struct jbd_block_tail *)((char *)bhdr + block_size) - 1;
+       tail->checksum = to_be32(jbd_meta_csum(jbd_fs, bhdr));
+}
+
+#if CONFIG_META_CSUM_ENABLE
+static bool
+jbd_verify_meta_csum(struct jbd_fs *jbd_fs,
+                    struct jbd_bhdr *bhdr)
+{
+       uint32_t block_size;
+       struct jbd_block_tail *tail;
+       if (!jbd_has_csum(&jbd_fs->sb))
+               return true;
+       /* Compare against the checksum kept in the block's tail record. */
+       block_size = jbd_get32(&jbd_fs->sb, blocksize);
+       tail = (struct jbd_block_tail *)((char *)bhdr + block_size) - 1;
+       return jbd_meta_csum(jbd_fs, bhdr) == to_be32(tail->checksum);
+}
+#else
+#define jbd_verify_meta_csum(...) true
+#endif
+
+#if CONFIG_META_CSUM_ENABLE
+static uint32_t jbd_commit_csum(struct jbd_fs *jbd_fs,
+                             struct jbd_commit_header *header)
+{
+       uint32_t checksum = 0;
+       /* Stays 0 unless a v2/v3 checksum feature is enabled. */
+       if (jbd_has_csum(&jbd_fs->sb)) {
+               uint32_t orig_checksum_type = header->chksum_type,
+                        orig_checksum_size = header->chksum_size,
+                        orig_checksum = header->chksum[0];
+               uint32_t block_size = jbd_get32(&jbd_fs->sb, blocksize);
+               header->chksum_type = 0;
+               header->chksum_size = 0;
+               header->chksum[0] = 0;
+               /* Hash with the checksum fields cleared; restored below. */
+               /* First calculate crc32c checksum against the journal uuid */
+               checksum = ext4_crc32c(EXT4_CRC32_INIT, jbd_fs->sb.uuid,
+                                      sizeof(jbd_fs->sb.uuid));
+               /* Then continue the crc32c over the whole block */
+               checksum = ext4_crc32c(checksum, header,
+                               block_size);
+
+               header->chksum_type = orig_checksum_type;
+               header->chksum_size = orig_checksum_size;
+               header->chksum[0] = orig_checksum;
+       }
+       return checksum;
+}
+#else
+#define jbd_commit_csum(...) 0
+#endif
+
+/* Store the commit block checksum (big-endian) on checksum-enabled journals. */
+static void jbd_commit_csum_set(struct jbd_fs *jbd_fs,
+                             struct jbd_commit_header *header)
+{
+       if (!jbd_has_csum(&jbd_fs->sb))
+               return;
+       header->chksum_type = 0;
+       header->chksum_size = 0;
+       header->chksum[0] = to_be32(jbd_commit_csum(jbd_fs, header));
+}
+
+#if CONFIG_META_CSUM_ENABLE
+static bool jbd_verify_commit_csum(struct jbd_fs *jbd_fs,
+                                  struct jbd_commit_header *header)
+{
+       /* Journals without a v2/v3 checksum feature always pass. */
+       if (!jbd_has_csum(&jbd_fs->sb))
+               return true;
+       return to_be32(jbd_commit_csum(jbd_fs, header)) ==
+              header->chksum[0];
+}
+#else
+#define jbd_verify_commit_csum(...) true
+#endif
+
+#if CONFIG_META_CSUM_ENABLE
+/*
+ * v2/v3 journals: crc32c over uuid, @sequence and @buf. Otherwise, when the
+ * JBD_FEATURE_COMPAT_CHECKSUM test passes: crc32 of @buf seeded with @csum.
+ */
+static uint32_t jbd_block_csum(struct jbd_fs *jbd_fs, const void *buf,
+                              uint32_t csum,
+                              uint32_t sequence)
+{
+       uint32_t checksum = 0;
+       /* Stays 0 when neither checksum scheme applies. */
+       if (jbd_has_csum(&jbd_fs->sb)) {
+               uint32_t block_size = jbd_get32(&jbd_fs->sb, blocksize);
+               /* First calculate crc32c checksum against fs uuid */
+               checksum = ext4_crc32c(EXT4_CRC32_INIT, jbd_fs->sb.uuid,
+                                      sizeof(jbd_fs->sb.uuid));
+               /* Then calculate crc32c checksum against sequence no. */
+               checksum = ext4_crc32c(checksum, &sequence,
+                               sizeof(uint32_t));
+               /* Calculate crc32c checksum against the whole block */
+               checksum = ext4_crc32c(checksum, buf,
+                               block_size);
+       } else if (JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb,
+                                    JBD_FEATURE_COMPAT_CHECKSUM)) {
+               uint32_t block_size = jbd_get32(&jbd_fs->sb, blocksize);
+               /* NOTE(review): COMPAT flag tested with INCOMPAT macro - confirm */
+               checksum = ext4_crc32(csum, buf,
+                               block_size);
+       }
+       return checksum;
+}
+#else
+#define jbd_block_csum(...) 0
+#endif
+
+/* Store @checksum in a block tag: low 16 bits for v2 tags, all 32 for v3. */
+static void jbd_block_tag_csum_set(struct jbd_fs *jbd_fs, void *__tag,
+                                  uint32_t checksum)
+{
+       int ver = jbd_has_csum(&jbd_fs->sb);
+       if (!ver)
+               return;
+       if (ver == 2) {
+               struct jbd_block_tag *tag = __tag;
+               jbd_set16(tag, checksum, (uint16_t)checksum);
+       } else {
+               struct jbd_block_tag3 *tag = __tag;
+               jbd_set32(tag, checksum, checksum);
+       }
+}
+
 /**@brief  Write jbd superblock to disk.
  * @param  jbd_fs jbd filesystem
  * @param  s jbd superblock
@@ -133,6 +353,7 @@ static int jbd_sb_write(struct jbd_fs *jbd_fs, struct jbd_sb *s)
        if (rc != EOK)
                return rc;
 
+       jbd_sb_csum_set(s);
        offset = fblock * ext4_sb_get_block_size(&fs->sb);
        return ext4_block_writebytes(fs->bdev, offset, s,
                                     EXT4_SUPERBLOCK_SIZE);
@@ -170,7 +391,7 @@ static bool jbd_verify_sb(struct jbd_sb *sb)
            jbd_get32(header, blocktype) != JBD_SUPERBLOCK_V2)
                return false;
 
-       return true;
+       return jbd_verify_sb_csum(sb);
 }
 
 /**@brief  Write back dirty jbd superblock to disk.
@@ -281,8 +502,12 @@ static int jbd_block_get(struct jbd_fs *jbd_fs,
 
        /* If succeeded, mark buffer as BC_FLUSH to indicate
         * that data should be written to disk immediately.*/
-       if (rc == EOK)
+       if (rc == EOK) {
                ext4_bcache_set_flag(block->buf, BC_FLUSH);
+               /* As we don't want to occupy too much space
+                * in block cache, we set this buffer BC_TMP.*/
+               ext4_bcache_set_flag(block->buf, BC_TMP);
+       }
 
        return rc;
 }
@@ -370,6 +595,9 @@ struct tag_info {
 
        /**@brief  Is this the last tag? */
        bool last_tag;
+
+       /**@brief  crc32c checksum. */
+       uint32_t checksum;
 };
 
 /**@brief  Extract information from a block tag.
@@ -490,6 +718,8 @@ jbd_write_block_tag(struct jbd_fs *jbd_fs,
                        jbd_set32(tag, flags,
                                  jbd_get32(tag, flags) | JBD_FLAG_SAME_UUID);
 
+               jbd_block_tag_csum_set(jbd_fs, __tag, tag_info->checksum);
+
                if (tag_info->last_tag)
                        jbd_set32(tag, flags,
                                  jbd_get32(tag, flags) | JBD_FLAG_LAST_TAG);
@@ -514,6 +744,8 @@ jbd_write_block_tag(struct jbd_fs *jbd_fs,
                        jbd_set16(tag, flags,
                                  jbd_get16(tag, flags) | JBD_FLAG_SAME_UUID);
 
+               jbd_block_tag_csum_set(jbd_fs, __tag, tag_info->checksum);
+
                if (tag_info->last_tag)
                        jbd_set16(tag, flags,
                                  jbd_get16(tag, flags) | JBD_FLAG_LAST_TAG);
@@ -796,6 +1028,10 @@ static int jbd_iterate_log(struct jbd_fs *jbd_fs,
        /* We start iterating valid blocks in the whole journal.*/
        start_trans_id = this_trans_id = jbd_get32(sb, sequence);
        start_block = this_block = jbd_get32(sb, start);
+       if (action == ACTION_SCAN)
+               info->trans_cnt = 0;
+       else if (!info->trans_cnt)
+               log_end = true;
 
        ext4_dbg(DEBUG_JBD, "Start of journal at trans id: %" PRIu32 "\n",
                            start_trans_id);
@@ -842,6 +1078,14 @@ static int jbd_iterate_log(struct jbd_fs *jbd_fs,
 
                switch (jbd_get32(header, blocktype)) {
                case JBD_DESCRIPTOR_BLOCK:
+                       if (!jbd_verify_meta_csum(jbd_fs, header)) {
+                               ext4_dbg(DEBUG_JBD,
+                                       DBG_WARN "Descriptor block checksum failed."
+                                               "Journal block: %" PRIu32"\n",
+                                               this_block);
+                               log_end = true;
+                               break;
+                       }
                        ext4_dbg(DEBUG_JBD, "Descriptor block: %" PRIu32", "
                                            "trans_id: %" PRIu32"\n",
                                            this_block, this_trans_id);
@@ -859,6 +1103,15 @@ static int jbd_iterate_log(struct jbd_fs *jbd_fs,
 
                        break;
                case JBD_COMMIT_BLOCK:
+                       if (!jbd_verify_commit_csum(jbd_fs,
+                                       (struct jbd_commit_header *)header)) {
+                               ext4_dbg(DEBUG_JBD,
+                                       DBG_WARN "Commit block checksum failed."
+                                               "Journal block: %" PRIu32"\n",
+                                               this_block);
+                               log_end = true;
+                               break;
+                       }
                        ext4_dbg(DEBUG_JBD, "Commit block: %" PRIu32", "
                                            "trans_id: %" PRIu32"\n",
                                            this_block, this_trans_id);
@@ -866,8 +1119,17 @@ static int jbd_iterate_log(struct jbd_fs *jbd_fs,
                         * we may now proceed to the next transaction.
                         */
                        this_trans_id++;
+                       info->trans_cnt++;
                        break;
                case JBD_REVOKE_BLOCK:
+                       if (!jbd_verify_meta_csum(jbd_fs, header)) {
+                               ext4_dbg(DEBUG_JBD,
+                                       DBG_WARN "Revoke block checksum failed."
+                                               "Journal block: %" PRIu32"\n",
+                                               this_block);
+                               log_end = true;
+                               break;
+                       }
                        ext4_dbg(DEBUG_JBD, "Revoke block: %" PRIu32", "
                                            "trans_id: %" PRIu32"\n",
                                            this_block, this_trans_id);
@@ -963,6 +1225,7 @@ int jbd_journal_start(struct jbd_fs *jbd_fs,
        uint32_t features_incompatible =
                        ext4_get32(&jbd_fs->inode_ref.fs->sb,
                                   features_incompatible);
+       struct ext4_block block = EXT4_BLOCK_ZERO();
        features_incompatible |= EXT4_FINCOM_RECOVER;
        ext4_set32(&jbd_fs->inode_ref.fs->sb,
                        features_incompatible,
@@ -980,6 +1243,21 @@ int jbd_journal_start(struct jbd_fs *jbd_fs,
 
        journal->block_size = jbd_get32(&jbd_fs->sb, blocksize);
 
+       r = jbd_block_get_noread(jbd_fs,
+                        &block,
+                        journal->start);
+       if (r != EOK) {
+               memset(journal, 0, sizeof(struct jbd_journal));
+               return r;
+       }
+       memset(block.data, 0, journal->block_size);
+       ext4_bcache_set_dirty(block.buf);
+       r = jbd_block_set(jbd_fs, &block);
+       if (r != EOK) {
+               memset(journal, 0, sizeof(struct jbd_journal));
+               return r;
+       }
+
        TAILQ_INIT(&journal->trans_queue);
        TAILQ_INIT(&journal->cp_queue);
        RB_INIT(&journal->block_rec_root);
@@ -988,16 +1266,43 @@ int jbd_journal_start(struct jbd_fs *jbd_fs,
        return jbd_write_sb(jbd_fs);
 }
 
+static void jbd_trans_end_write(struct ext4_bcache *bc __unused,
+                         struct ext4_buf *buf __unused,
+                         int res,
+                         void *arg);
+
 static void jbd_journal_flush_trans(struct jbd_trans *trans)
 {
        struct jbd_buf *jbd_buf, *tmp;
        struct jbd_journal *journal = trans->journal;
        struct ext4_fs *fs = journal->jbd_fs->inode_ref.fs;
-       LIST_FOREACH_SAFE(jbd_buf, &trans->buf_list, buf_node,
+       void *tmp_data = malloc(journal->block_size);
+       ext4_assert(tmp_data);
+
+       TAILQ_FOREACH_SAFE(jbd_buf, &trans->buf_queue, buf_node,
                        tmp) {
-               struct ext4_block block = jbd_buf->block;
-               ext4_block_flush_buf(fs->bdev, block.buf);
+               struct ext4_buf *buf = jbd_buf->block_rec->buf;
+               /* The buffer in memory is still dirty. */
+               if (buf) {
+                       if (jbd_buf->block_rec->trans != trans) {
+                               int r;
+                               struct ext4_block jbd_block = EXT4_BLOCK_ZERO();
+                               ext4_assert(ext4_block_get(fs->bdev,
+                                                       &jbd_block,
+                                                       jbd_buf->jbd_lba) == EOK);
+                               memcpy(tmp_data, jbd_block.data,
+                                               journal->block_size);
+                               ext4_block_set(fs->bdev, &jbd_block);
+                               r = ext4_blocks_set_direct(fs->bdev, tmp_data,
+                                               buf->lba, 1);
+                               jbd_trans_end_write(fs->bdev->bc, buf, r, jbd_buf);
+                       } else
+                               ext4_block_flush_buf(fs->bdev, buf);
+
+               }
        }
+
+       free(tmp_data);
 }
 
 static void
@@ -1013,7 +1318,9 @@ jbd_journal_skip_pure_revoke(struct jbd_journal *journal,
        jbd_journal_write_sb(journal);
 }
 
-static void jbd_journal_flush_all_trans(struct jbd_journal *journal)
+static void
+jbd_journal_purge_cp_trans(struct jbd_journal *journal,
+                          bool flush)
 {
        struct jbd_trans *trans;
        while ((trans = TAILQ_FIRST(&journal->cp_queue))) {
@@ -1022,9 +1329,35 @@ static void jbd_journal_flush_all_trans(struct jbd_journal *journal)
                                        trans,
                                        trans_node);
                        jbd_journal_skip_pure_revoke(journal, trans);
-               } else
-                       jbd_journal_flush_trans(trans);
-
+               } else {
+                       if (trans->data_cnt ==
+                                       trans->written_cnt) {
+                               journal->start =
+                                       trans->start_iblock +
+                                       trans->alloc_blocks;
+                               wrap(&journal->jbd_fs->sb,
+                                               journal->start);
+                               journal->trans_id =
+                                       trans->trans_id + 1;
+                               TAILQ_REMOVE(&journal->cp_queue,
+                                               trans,
+                                               trans_node);
+                               jbd_journal_free_trans(journal,
+                                               trans,
+                                               false);
+                               jbd_journal_write_sb(journal);
+                       } else if (!flush) {
+                               journal->start =
+                                       trans->start_iblock;
+                               wrap(&journal->jbd_fs->sb,
+                                               journal->start);
+                               journal->trans_id =
+                                       trans->trans_id;
+                               jbd_journal_write_sb(journal);
+                               break;
+                       } else
+                               jbd_journal_flush_trans(trans);
+               }
        }
 }
 
@@ -1037,12 +1370,9 @@ int jbd_journal_stop(struct jbd_journal *journal)
        struct jbd_fs *jbd_fs = journal->jbd_fs;
        uint32_t features_incompatible;
 
-       /* Commit all the transactions to the journal.*/
-       jbd_journal_commit_all(journal);
-
        /* Make sure that journalled content have reached
         * the disk.*/
-       jbd_journal_flush_all_trans(journal);
+       jbd_journal_purge_cp_trans(journal, true);
 
        /* There should be no block record in this journal
         * session. */
@@ -1085,7 +1415,7 @@ static uint32_t jbd_journal_alloc_block(struct jbd_journal *journal,
        /* If there is no space left, flush all journalled
         * blocks to disk first.*/
        if (journal->last == journal->start)
-               jbd_journal_flush_all_trans(journal);
+               jbd_journal_purge_cp_trans(journal, true);
 
        return start_block;
 }
@@ -1103,15 +1433,12 @@ jbd_journal_new_trans(struct jbd_journal *journal)
        /* We will assign a trans_id to this transaction,
         * once it has been committed.*/
        trans->journal = journal;
+       trans->data_csum = EXT4_CRC32_INIT;
        trans->error = EOK;
+       TAILQ_INIT(&trans->buf_queue);
        return trans;
 }
 
-static void jbd_trans_end_write(struct ext4_bcache *bc __unused,
-                         struct ext4_buf *buf __unused,
-                         int res,
-                         void *arg);
-
 /**@brief  gain access to it before making any modifications.
  * @param  journal current journal session
  * @param  trans transaction
@@ -1150,6 +1477,18 @@ jbd_trans_block_rec_lookup(struct jbd_journal *journal,
                       &tmp);
 }
 
+static void
+jbd_trans_change_ownership(struct jbd_block_rec *block_rec,
+                          struct jbd_trans *new_trans,
+                          struct ext4_buf *new_buf)
+{
+       /* Re-home the record: new owner and buffer, then relink its list node. */
+       block_rec->buf = new_buf;
+       block_rec->trans = new_trans;
+       LIST_REMOVE(block_rec, tbrec_node);
+       LIST_INSERT_HEAD(&new_trans->tbrec_list, block_rec, tbrec_node);
+}
+
 static inline struct jbd_block_rec *
 jbd_trans_insert_block_rec(struct jbd_trans *trans,
                           ext4_fsblk_t lba,
@@ -1158,10 +1497,7 @@ jbd_trans_insert_block_rec(struct jbd_trans *trans,
        struct jbd_block_rec *block_rec;
        block_rec = jbd_trans_block_rec_lookup(trans->journal, lba);
        if (block_rec) {
-               /* Data should be flushed to disk already. */
-               ext4_assert(!block_rec->buf);
-               /* Now this block record belongs to this transaction. */
-               block_rec->trans = trans;
+               jbd_trans_change_ownership(block_rec, trans, buf);
                return block_rec;
        }
        block_rec = calloc(1, sizeof(struct jbd_block_rec));
@@ -1171,18 +1507,77 @@ jbd_trans_insert_block_rec(struct jbd_trans *trans,
        block_rec->lba = lba;
        block_rec->buf = buf;
        block_rec->trans = trans;
+       TAILQ_INIT(&block_rec->dirty_buf_queue);
+       LIST_INSERT_HEAD(&trans->tbrec_list, block_rec, tbrec_node);
        RB_INSERT(jbd_block, &trans->journal->block_rec_root, block_rec);
        return block_rec;
 }
 
+/* Finish @trans's use of @block_rec; does nothing unless @trans owns it.
+ * !abort: end each queued write. abort: pass it to the last queued buffer. */
+static void
+jbd_trans_finish_callback(struct jbd_journal *journal,
+                         const struct jbd_trans *trans,
+                         struct jbd_block_rec *block_rec,
+                         bool abort)
+{
+       struct ext4_fs *fs = journal->jbd_fs->inode_ref.fs;
+       if (block_rec->trans != trans)
+               return;
+
+       if (!abort) {
+               struct jbd_buf *jbd_buf, *tmp;
+               TAILQ_FOREACH_SAFE(jbd_buf,
+                               &block_rec->dirty_buf_queue,
+                               dirty_buf_node,
+                               tmp) {
+                       /* All we need is a fake ext4_buf. */
+                       struct ext4_buf buf;
+
+                       jbd_trans_end_write(fs->bdev->bc, &buf, EOK, jbd_buf);
+               }
+       } else {
+               int r;
+               struct jbd_buf *jbd_buf;
+               struct ext4_block jbd_block = EXT4_BLOCK_ZERO(),
+                                 block = EXT4_BLOCK_ZERO();
+               jbd_buf = TAILQ_LAST(&block_rec->dirty_buf_queue,
+                               jbd_buf_dirty);
+               if (jbd_buf) {
+                       /* Fetch outside ext4_assert() so the calls always run. */
+                       r = ext4_block_get(fs->bdev, &jbd_block,
+                                          jbd_buf->jbd_lba);
+                       ext4_assert(r == EOK);
+                       r = ext4_block_get_noread(fs->bdev, &block,
+                                                 block_rec->lba);
+                       ext4_assert(r == EOK);
+                       memcpy(block.data, jbd_block.data,
+                                       journal->block_size);
+
+                       jbd_trans_change_ownership(block_rec,
+                                       jbd_buf->trans, block.buf);
+
+                       block.buf->end_write = jbd_trans_end_write;
+                       block.buf->end_write_arg = jbd_buf;
+
+                       ext4_bcache_set_flag(jbd_block.buf, BC_TMP);
+                       ext4_bcache_set_dirty(block.buf);
+
+                       ext4_block_set(fs->bdev, &jbd_block);
+                       ext4_block_set(fs->bdev, &block);
+               }
+       }
+}
+
 static inline void
 jbd_trans_remove_block_rec(struct jbd_journal *journal,
-                          struct jbd_buf *jbd_buf)
+                          struct jbd_block_rec *block_rec,
+                          struct jbd_trans *trans)
 {
-       struct jbd_block_rec *block_rec = jbd_buf->block_rec;
        /* If this block record doesn't belong to this transaction,
         * give up.*/
-       if (block_rec->trans == jbd_buf->trans) {
+       if (block_rec->trans == trans) {
+               LIST_REMOVE(block_rec, tbrec_node);
                RB_REMOVE(jbd_block,
                                &journal->block_rec_root,
                                block_rec);
@@ -1199,35 +1594,41 @@ int jbd_trans_set_block_dirty(struct jbd_trans *trans,
 {
        struct jbd_buf *buf;
 
-       if (!ext4_bcache_test_flag(block->buf, BC_DIRTY) &&
-           block->buf->end_write != jbd_trans_end_write) {
-               struct jbd_block_rec *block_rec;
-               buf = calloc(1, sizeof(struct jbd_buf));
-               if (!buf)
-                       return ENOMEM;
+       struct jbd_block_rec *block_rec;
+       if (block->buf->end_write == jbd_trans_end_write) {
+               buf = block->buf->end_write_arg;
+               if (buf && buf->trans == trans)
+                       return EOK;
+       }
+       buf = calloc(1, sizeof(struct jbd_buf));
+       if (!buf)
+               return ENOMEM;
 
-               if ((block_rec = jbd_trans_insert_block_rec(trans,
+       if ((block_rec = jbd_trans_insert_block_rec(trans,
                                        block->lb_id,
                                        block->buf)) == NULL) {
-                       free(buf);
-                       return ENOMEM;
-               }
+               free(buf);
+               return ENOMEM;
+       }
 
-               buf->block_rec = block_rec;
-               buf->trans = trans;
-               buf->block = *block;
-               ext4_bcache_inc_ref(block->buf);
+       TAILQ_INSERT_TAIL(&block_rec->dirty_buf_queue,
+                       buf,
+                       dirty_buf_node);
 
-               /* If the content reach the disk, notify us
-                * so that we may do a checkpoint. */
-               block->buf->end_write = jbd_trans_end_write;
-               block->buf->end_write_arg = buf;
+       buf->block_rec = block_rec;
+       buf->trans = trans;
+       buf->block = *block;
+       ext4_bcache_inc_ref(block->buf);
 
-               trans->data_cnt++;
-               LIST_INSERT_HEAD(&trans->buf_list, buf, buf_node);
+       /* If the content reach the disk, notify us
+        * so that we may do a checkpoint. */
+       block->buf->end_write = jbd_trans_end_write;
+       block->buf->end_write_arg = buf;
 
-               ext4_bcache_set_dirty(block->buf);
-       }
+       trans->data_cnt++;
+       TAILQ_INSERT_HEAD(&trans->buf_queue, buf, buf_node);
+
+       ext4_bcache_set_dirty(block->buf);
        return EOK;
 }
 
@@ -1290,9 +1691,11 @@ void jbd_journal_free_trans(struct jbd_journal *journal,
 {
        struct jbd_buf *jbd_buf, *tmp;
        struct jbd_revoke_rec *rec, *tmp2;
+       struct jbd_block_rec *block_rec, *tmp3;
        struct ext4_fs *fs = journal->jbd_fs->inode_ref.fs;
-       LIST_FOREACH_SAFE(jbd_buf, &trans->buf_list, buf_node,
+       TAILQ_FOREACH_SAFE(jbd_buf, &trans->buf_queue, buf_node,
                          tmp) {
+               block_rec = jbd_buf->block_rec;
                if (abort) {
                        jbd_buf->block.buf->end_write = NULL;
                        jbd_buf->block.buf->end_write_arg = NULL;
@@ -1300,8 +1703,14 @@ void jbd_journal_free_trans(struct jbd_journal *journal,
                        ext4_block_set(fs->bdev, &jbd_buf->block);
                }
 
-               jbd_trans_remove_block_rec(journal, jbd_buf);
-               LIST_REMOVE(jbd_buf, buf_node);
+               TAILQ_REMOVE(&jbd_buf->block_rec->dirty_buf_queue,
+                       jbd_buf,
+                       dirty_buf_node);
+               jbd_trans_finish_callback(journal,
+                               trans,
+                               block_rec,
+                               abort);
+               TAILQ_REMOVE(&trans->buf_queue, jbd_buf, buf_node);
                free(jbd_buf);
        }
        LIST_FOREACH_SAFE(rec, &trans->revoke_list, revoke_node,
@@ -1309,6 +1718,10 @@ void jbd_journal_free_trans(struct jbd_journal *journal,
                LIST_REMOVE(rec, revoke_node);
                free(rec);
        }
+       LIST_FOREACH_SAFE(block_rec, &trans->tbrec_list, tbrec_node,
+                         tmp3) {
+               jbd_trans_remove_block_rec(journal, block_rec, trans);
+       }
 
        free(trans);
 }
@@ -1335,6 +1748,13 @@ static int jbd_trans_write_commit_block(struct jbd_trans *trans)
        jbd_set32(&header->header, blocktype, JBD_COMMIT_BLOCK);
        jbd_set32(&header->header, sequence, trans->trans_id);
 
+       if (JBD_HAS_INCOMPAT_FEATURE(&journal->jbd_fs->sb,
+                               JBD_FEATURE_COMPAT_CHECKSUM)) {
+               jbd_set32(header, chksum_type, JBD_CRC32_CHKSUM);
+               jbd_set32(header, chksum_size, JBD_CRC32_CHKSUM_SIZE);
+               jbd_set32(header, chksum[0], trans->data_csum);
+       }
+       jbd_commit_csum_set(journal->jbd_fs, header);
        ext4_bcache_set_dirty(commit_block.buf);
        rc = jbd_block_set(journal->jbd_fs, &commit_block);
        if (rc != EOK)
@@ -1358,21 +1778,70 @@ static int jbd_journal_prepare(struct jbd_journal *journal,
        struct jbd_buf *jbd_buf, *tmp;
        struct ext4_block desc_block, data_block;
        struct ext4_fs *fs = journal->jbd_fs->inode_ref.fs;
+       uint32_t checksum = EXT4_CRC32_INIT;
+
+       /* Try to remove any non-dirty buffers from the tail of
+        * buf_queue. */
+       TAILQ_FOREACH_REVERSE_SAFE(jbd_buf, &trans->buf_queue,
+                       jbd_trans_buf, buf_node, tmp) {
+               /* We stop the iteration when we find a dirty buffer. */
+               if (ext4_bcache_test_flag(jbd_buf->block.buf,
+                                       BC_DIRTY))
+                       break;
+       
+               TAILQ_REMOVE(&jbd_buf->block_rec->dirty_buf_queue,
+                       jbd_buf,
+                       dirty_buf_node);
+
+               jbd_buf->block.buf->end_write = NULL;
+               jbd_buf->block.buf->end_write_arg = NULL;
+               jbd_trans_finish_callback(journal,
+                               trans,
+                               jbd_buf->block_rec,
+                               true);
+
+               /* The buffer has not been modified, just release
+                * that jbd_buf. */
+               jbd_trans_remove_block_rec(journal,
+                               jbd_buf->block_rec, trans);
+               trans->data_cnt--;
+
+               ext4_block_set(fs->bdev, &jbd_buf->block);
+               TAILQ_REMOVE(&trans->buf_queue, jbd_buf, buf_node);
+               free(jbd_buf);
+       }
 
-       LIST_FOREACH_SAFE(jbd_buf, &trans->buf_list, buf_node, tmp) {
+       TAILQ_FOREACH_SAFE(jbd_buf, &trans->buf_queue, buf_node, tmp) {
                struct tag_info tag_info;
                bool uuid_exist = false;
                if (!ext4_bcache_test_flag(jbd_buf->block.buf,
                                           BC_DIRTY)) {
-                       /* The buffer has not been modified, just release
-                        * that jbd_buf. */
+                       TAILQ_REMOVE(&jbd_buf->block_rec->dirty_buf_queue,
+                                       jbd_buf,
+                                       dirty_buf_node);
+
                        jbd_buf->block.buf->end_write = NULL;
                        jbd_buf->block.buf->end_write_arg = NULL;
+                       jbd_trans_finish_callback(journal,
+                                       trans,
+                                       jbd_buf->block_rec,
+                                       true);
+
+                       /* The buffer has not been modified, just release
+                        * that jbd_buf. */
+                       jbd_trans_remove_block_rec(journal,
+                                       jbd_buf->block_rec, trans);
+                       trans->data_cnt--;
+
                        ext4_block_set(fs->bdev, &jbd_buf->block);
-                       LIST_REMOVE(jbd_buf, buf_node);
+                       TAILQ_REMOVE(&trans->buf_queue, jbd_buf, buf_node);
                        free(jbd_buf);
                        continue;
                }
+               checksum = jbd_block_csum(journal->jbd_fs,
+                                         jbd_buf->block.data,
+                                         checksum,
+                                         trans->trans_id);
 again:
                if (!desc_iblock) {
                        struct jbd_bhdr *bhdr;
@@ -1395,6 +1864,9 @@ again:
                        tag_tbl_size = journal->block_size -
                                sizeof(struct jbd_bhdr);
 
+                       if (jbd_has_csum(&journal->jbd_fs->sb))
+                               tag_tbl_size -= sizeof(struct jbd_block_tail);
+
                        if (!trans->start_iblock)
                                trans->start_iblock = desc_iblock;
 
@@ -1406,6 +1878,8 @@ again:
                else
                        tag_info.last_tag = false;
 
+               tag_info.checksum = checksum;
+
                if (uuid_exist)
                        memcpy(tag_info.uuid, journal->jbd_fs->sb.uuid,
                                        UUID_SIZE);
@@ -1415,6 +1889,8 @@ again:
                                tag_tbl_size,
                                &tag_info);
                if (rc != EOK) {
+                       jbd_meta_csum_set(journal->jbd_fs,
+                                       (struct jbd_bhdr *)desc_block.data);
                        jbd_block_set(journal->jbd_fs, &desc_block);
                        desc_iblock = 0;
                        goto again;
@@ -1430,6 +1906,7 @@ again:
 
                memcpy(data_block.data, jbd_buf->block.data,
                        journal->block_size);
+               jbd_buf->jbd_lba = data_block.lb_id;
 
                rc = jbd_block_set(journal->jbd_fs, &data_block);
                if (rc != EOK)
@@ -1440,8 +1917,12 @@ again:
 
                i++;
        }
-       if (rc == EOK && desc_iblock)
+       if (rc == EOK && desc_iblock) {
+               jbd_meta_csum_set(journal->jbd_fs,
+                               (struct jbd_bhdr *)desc_block.data);
+               trans->data_csum = checksum;
                jbd_block_set(journal->jbd_fs, &desc_block);
+       }
 
        return rc;
 }
@@ -1491,6 +1972,9 @@ again:
                        tag_tbl_size = journal->block_size -
                                sizeof(struct jbd_revoke_header);
 
+                       if (jbd_has_csum(&journal->jbd_fs->sb))
+                               tag_tbl_size -= sizeof(struct jbd_block_tail);
+
                        if (!trans->start_iblock)
                                trans->start_iblock = desc_iblock;
 
@@ -1499,6 +1983,8 @@ again:
                if (tag_tbl_size < record_len) {
                        jbd_set32(header, count,
                                  journal->block_size - tag_tbl_size);
+                       jbd_meta_csum_set(journal->jbd_fs,
+                                       (struct jbd_bhdr *)desc_block.data);
                        jbd_block_set(journal->jbd_fs, &desc_block);
                        desc_iblock = 0;
                        header = NULL;
@@ -1523,24 +2009,14 @@ again:
                        jbd_set32(header, count,
                                  journal->block_size - tag_tbl_size);
 
+               jbd_meta_csum_set(journal->jbd_fs,
+                               (struct jbd_bhdr *)desc_block.data);
                jbd_block_set(journal->jbd_fs, &desc_block);
        }
 
        return rc;
 }
 
-/**@brief  Submit the transaction to transaction queue.
- * @param  journal current journal session
- * @param  trans transaction*/
-void
-jbd_journal_submit_trans(struct jbd_journal *journal,
-                        struct jbd_trans *trans)
-{
-       TAILQ_INSERT_TAIL(&journal->trans_queue,
-                         trans,
-                         trans_node);
-}
-
 /**@brief  Put references of block descriptors in a transaction.
  * @param  journal current journal session
  * @param  trans transaction*/
@@ -1548,7 +2024,7 @@ void jbd_journal_cp_trans(struct jbd_journal *journal, struct jbd_trans *trans)
 {
        struct jbd_buf *jbd_buf, *tmp;
        struct ext4_fs *fs = journal->jbd_fs->inode_ref.fs;
-       LIST_FOREACH_SAFE(jbd_buf, &trans->buf_list, buf_node,
+       TAILQ_FOREACH_SAFE(jbd_buf, &trans->buf_queue, buf_node,
                        tmp) {
                struct ext4_block block = jbd_buf->block;
                ext4_block_set(fs->bdev, &block);
@@ -1564,48 +2040,47 @@ static void jbd_trans_end_write(struct ext4_bcache *bc __unused,
 {
        struct jbd_buf *jbd_buf = arg;
        struct jbd_trans *trans = jbd_buf->trans;
+       struct jbd_block_rec *block_rec = jbd_buf->block_rec;
        struct jbd_journal *journal = trans->journal;
        bool first_in_queue =
                trans == TAILQ_FIRST(&journal->cp_queue);
        if (res != EOK)
                trans->error = res;
 
-       LIST_REMOVE(jbd_buf, buf_node);
-       jbd_buf->block_rec->buf = NULL;
-       jbd_trans_remove_block_rec(journal, jbd_buf);
-       free(jbd_buf);
+       TAILQ_REMOVE(&trans->buf_queue, jbd_buf, buf_node);
+       TAILQ_REMOVE(&block_rec->dirty_buf_queue,
+                       jbd_buf,
+                       dirty_buf_node);
+
+       jbd_trans_finish_callback(journal,
+                       trans,
+                       jbd_buf->block_rec,
+                       false);
+       if (block_rec->trans == trans) {
+               block_rec->buf = NULL;
+               /* Clear the end_write and end_write_arg fields. */
+               buf->end_write = NULL;
+               buf->end_write_arg = NULL;
+       }
 
-       /* Clear the end_write and end_write_arg fields. */
-       buf->end_write = NULL;
-       buf->end_write_arg = NULL;
+       free(jbd_buf);
 
        trans->written_cnt++;
        if (trans->written_cnt == trans->data_cnt) {
-               TAILQ_REMOVE(&journal->cp_queue, trans, trans_node);
-
+               /* If it is the first transaction on checkpoint queue,
+                * we will shift the start of the journal to the next
+                * transaction, and remove subsequent written
+                * transactions from checkpoint queue until we find
+                * an unwritten one. */
                if (first_in_queue) {
                        journal->start = trans->start_iblock +
                                trans->alloc_blocks;
                        wrap(&journal->jbd_fs->sb, journal->start);
                        journal->trans_id = trans->trans_id + 1;
-               }
-               jbd_journal_free_trans(journal, trans, false);
+                       TAILQ_REMOVE(&journal->cp_queue, trans, trans_node);
+                       jbd_journal_free_trans(journal, trans, false);
 
-               if (first_in_queue) {
-                       while ((trans = TAILQ_FIRST(&journal->cp_queue))) {
-                               if (!trans->data_cnt) {
-                                       TAILQ_REMOVE(&journal->cp_queue,
-                                                    trans,
-                                                    trans_node);
-                                       jbd_journal_skip_pure_revoke(journal,
-                                                                    trans);
-                               } else {
-                                       journal->start = trans->start_iblock;
-                                       wrap(&journal->jbd_fs->sb, journal->start);
-                                       journal->trans_id = trans->trans_id;
-                                       break;
-                               }
-                       }
+                       jbd_journal_purge_cp_trans(journal, false);
                        jbd_journal_write_sb(journal);
                        jbd_write_sb(journal->jbd_fs);
                }
@@ -1631,7 +2106,7 @@ int jbd_journal_commit_trans(struct jbd_journal *journal,
        if (rc != EOK)
                goto Finish;
 
-       if (LIST_EMPTY(&trans->buf_list) &&
+       if (TAILQ_EMPTY(&trans->buf_queue) &&
            LIST_EMPTY(&trans->revoke_list)) {
                /* Since there are no entries in both buffer list
                 * and revoke entry list, we do not consider trans as
@@ -1678,29 +2153,6 @@ Finish:
        return rc;
 }
 
-/**@brief  Commit one transaction on transaction queue
- *         to the journal.
- * @param  journal current journal session.*/
-void jbd_journal_commit_one(struct jbd_journal *journal)
-{
-       struct jbd_trans *trans;
-
-       if ((trans = TAILQ_FIRST(&journal->trans_queue))) {
-               TAILQ_REMOVE(&journal->trans_queue, trans, trans_node);
-               jbd_journal_commit_trans(journal, trans);
-       }
-}
-
-/**@brief  Commit all the transactions on transaction queue
- *         to the journal.
- * @param  journal current journal session.*/
-void jbd_journal_commit_all(struct jbd_journal *journal)
-{
-       while (!TAILQ_EMPTY(&journal->trans_queue)) {
-               jbd_journal_commit_one(journal);
-       }
-}
-
 /**
  * @}
  */