jbd2: refine waiting for shadow buffers
Currently when we add a buffer to a transaction, we wait until the buffer is removed from BJ_Shadow list (so that we prevent any changes to the buffer that is just written to the journal). This can take unnecessarily long as a lot happens between the time the buffer is submitted to the journal and the time when we remove the buffer from BJ_Shadow list. (e.g. We wait for all data buffers in the transaction, we issue a cache flush, etc.) Also this creates a dependency of do_get_write_access() on transaction commit (namely waiting for data IO to complete) which we want to avoid when implementing transaction reservation. So we modify commit code to set new BH_Shadow flag when temporary shadowing buffer is created and we clear that flag once IO on that buffer is complete. This allows do_get_write_access() to wait only for BH_Shadow bit and thus removes the dependency on data IO completion. Reviewed-by: Zheng Liu <wenqing.lz@taobao.com> Signed-off-by: Jan Kara <jack@suse.cz> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
This commit is contained in:
parent
e5a120aeb5
commit
b34090e5e2
6 changed files with 83 additions and 60 deletions
|
@ -30,15 +30,22 @@
|
|||
#include <trace/events/jbd2.h>
|
||||
|
||||
/*
|
||||
* Default IO end handler for temporary BJ_IO buffer_heads.
|
||||
* IO end handler for temporary buffer_heads handling writes to the journal.
|
||||
*/
|
||||
static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
|
||||
{
|
||||
struct buffer_head *orig_bh = bh->b_private;
|
||||
|
||||
BUFFER_TRACE(bh, "");
|
||||
if (uptodate)
|
||||
set_buffer_uptodate(bh);
|
||||
else
|
||||
clear_buffer_uptodate(bh);
|
||||
if (orig_bh) {
|
||||
clear_bit_unlock(BH_Shadow, &orig_bh->b_state);
|
||||
smp_mb__after_clear_bit();
|
||||
wake_up_bit(&orig_bh->b_state, BH_Shadow);
|
||||
}
|
||||
unlock_buffer(bh);
|
||||
}
|
||||
|
||||
|
@ -832,6 +839,7 @@ start_journal_io:
|
|||
bh = jh2bh(jh);
|
||||
clear_buffer_jwrite(bh);
|
||||
J_ASSERT_BH(bh, buffer_jbddirty(bh));
|
||||
J_ASSERT_BH(bh, !buffer_shadow(bh));
|
||||
|
||||
/* The metadata is now released for reuse, but we need
|
||||
to remember it against this transaction so that when
|
||||
|
@ -839,14 +847,6 @@ start_journal_io:
|
|||
required. */
|
||||
JBUFFER_TRACE(jh, "file as BJ_Forget");
|
||||
jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
|
||||
/*
|
||||
* Wake up any transactions which were waiting for this IO to
|
||||
* complete. The barrier must be here so that changes by
|
||||
* jbd2_journal_file_buffer() take effect before wake_up_bit()
|
||||
* does the waitqueue check.
|
||||
*/
|
||||
smp_mb();
|
||||
wake_up_bit(&bh->b_state, BH_Unshadow);
|
||||
JBUFFER_TRACE(jh, "brelse shadowed buffer");
|
||||
__brelse(bh);
|
||||
}
|
||||
|
|
|
@ -451,6 +451,7 @@ repeat:
|
|||
new_bh->b_size = bh_in->b_size;
|
||||
new_bh->b_bdev = journal->j_dev;
|
||||
new_bh->b_blocknr = blocknr;
|
||||
new_bh->b_private = bh_in;
|
||||
set_buffer_mapped(new_bh);
|
||||
set_buffer_dirty(new_bh);
|
||||
|
||||
|
@ -465,6 +466,7 @@ repeat:
|
|||
spin_lock(&journal->j_list_lock);
|
||||
__jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow);
|
||||
spin_unlock(&journal->j_list_lock);
|
||||
set_buffer_shadow(bh_in);
|
||||
jbd_unlock_bh_state(bh_in);
|
||||
|
||||
return do_escape | (done_copy_out << 1);
|
||||
|
|
|
@ -619,6 +619,12 @@ static void warn_dirty_buffer(struct buffer_head *bh)
|
|||
bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr);
|
||||
}
|
||||
|
||||
static int sleep_on_shadow_bh(void *word)
|
||||
{
|
||||
io_schedule();
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* If the buffer is already part of the current transaction, then there
|
||||
* is nothing we need to do. If it is already part of a prior
|
||||
|
@ -754,41 +760,29 @@ repeat:
|
|||
* journaled. If the primary copy is already going to
|
||||
* disk then we cannot do copy-out here. */
|
||||
|
||||
if (jh->b_jlist == BJ_Shadow) {
|
||||
DEFINE_WAIT_BIT(wait, &bh->b_state, BH_Unshadow);
|
||||
wait_queue_head_t *wqh;
|
||||
|
||||
wqh = bit_waitqueue(&bh->b_state, BH_Unshadow);
|
||||
|
||||
if (buffer_shadow(bh)) {
|
||||
JBUFFER_TRACE(jh, "on shadow: sleep");
|
||||
jbd_unlock_bh_state(bh);
|
||||
/* commit wakes up all shadow buffers after IO */
|
||||
for ( ; ; ) {
|
||||
prepare_to_wait(wqh, &wait.wait,
|
||||
TASK_UNINTERRUPTIBLE);
|
||||
if (jh->b_jlist != BJ_Shadow)
|
||||
break;
|
||||
schedule();
|
||||
}
|
||||
finish_wait(wqh, &wait.wait);
|
||||
wait_on_bit(&bh->b_state, BH_Shadow,
|
||||
sleep_on_shadow_bh, TASK_UNINTERRUPTIBLE);
|
||||
goto repeat;
|
||||
}
|
||||
|
||||
/* Only do the copy if the currently-owning transaction
|
||||
* still needs it. If it is on the Forget list, the
|
||||
* committing transaction is past that stage. The
|
||||
* buffer had better remain locked during the kmalloc,
|
||||
* but that should be true --- we hold the journal lock
|
||||
* still and the buffer is already on the BUF_JOURNAL
|
||||
* list so won't be flushed.
|
||||
/*
|
||||
* Only do the copy if the currently-owning transaction still
|
||||
* needs it. If buffer isn't on BJ_Metadata list, the
|
||||
* committing transaction is past that stage (here we use the
|
||||
* fact that BH_Shadow is set under bh_state lock together with
|
||||
* refiling to BJ_Shadow list and at this point we know the
|
||||
* buffer doesn't have BH_Shadow set).
|
||||
*
|
||||
* Subtle point, though: if this is a get_undo_access,
|
||||
* then we will be relying on the frozen_data to contain
|
||||
* the new value of the committed_data record after the
|
||||
* transaction, so we HAVE to force the frozen_data copy
|
||||
* in that case. */
|
||||
|
||||
if (jh->b_jlist != BJ_Forget || force_copy) {
|
||||
* in that case.
|
||||
*/
|
||||
if (jh->b_jlist == BJ_Metadata || force_copy) {
|
||||
JBUFFER_TRACE(jh, "generate frozen data");
|
||||
if (!frozen_buffer) {
|
||||
JBUFFER_TRACE(jh, "allocate memory for buffer");
|
||||
|
|
|
@ -244,6 +244,31 @@ typedef struct journal_superblock_s
|
|||
|
||||
#include <linux/fs.h>
|
||||
#include <linux/sched.h>
|
||||
|
||||
enum jbd_state_bits {
|
||||
BH_JBD /* Has an attached ext3 journal_head */
|
||||
= BH_PrivateStart,
|
||||
BH_JWrite, /* Being written to log (@@@ DEBUGGING) */
|
||||
BH_Freed, /* Has been freed (truncated) */
|
||||
BH_Revoked, /* Has been revoked from the log */
|
||||
BH_RevokeValid, /* Revoked flag is valid */
|
||||
BH_JBDDirty, /* Is dirty but journaled */
|
||||
BH_State, /* Pins most journal_head state */
|
||||
BH_JournalHead, /* Pins bh->b_private and jh->b_bh */
|
||||
BH_Unshadow, /* Dummy bit, for BJ_Shadow wakeup filtering */
|
||||
BH_JBDPrivateStart, /* First bit available for private use by FS */
|
||||
};
|
||||
|
||||
BUFFER_FNS(JBD, jbd)
|
||||
BUFFER_FNS(JWrite, jwrite)
|
||||
BUFFER_FNS(JBDDirty, jbddirty)
|
||||
TAS_BUFFER_FNS(JBDDirty, jbddirty)
|
||||
BUFFER_FNS(Revoked, revoked)
|
||||
TAS_BUFFER_FNS(Revoked, revoked)
|
||||
BUFFER_FNS(RevokeValid, revokevalid)
|
||||
TAS_BUFFER_FNS(RevokeValid, revokevalid)
|
||||
BUFFER_FNS(Freed, freed)
|
||||
|
||||
#include <linux/jbd_common.h>
|
||||
|
||||
#define J_ASSERT(assert) BUG_ON(!(assert))
|
||||
|
|
|
@ -302,6 +302,34 @@ typedef struct journal_superblock_s
|
|||
|
||||
#include <linux/fs.h>
|
||||
#include <linux/sched.h>
|
||||
|
||||
enum jbd_state_bits {
|
||||
BH_JBD /* Has an attached ext3 journal_head */
|
||||
= BH_PrivateStart,
|
||||
BH_JWrite, /* Being written to log (@@@ DEBUGGING) */
|
||||
BH_Freed, /* Has been freed (truncated) */
|
||||
BH_Revoked, /* Has been revoked from the log */
|
||||
BH_RevokeValid, /* Revoked flag is valid */
|
||||
BH_JBDDirty, /* Is dirty but journaled */
|
||||
BH_State, /* Pins most journal_head state */
|
||||
BH_JournalHead, /* Pins bh->b_private and jh->b_bh */
|
||||
BH_Shadow, /* IO on shadow buffer is running */
|
||||
BH_Verified, /* Metadata block has been verified ok */
|
||||
BH_JBDPrivateStart, /* First bit available for private use by FS */
|
||||
};
|
||||
|
||||
BUFFER_FNS(JBD, jbd)
|
||||
BUFFER_FNS(JWrite, jwrite)
|
||||
BUFFER_FNS(JBDDirty, jbddirty)
|
||||
TAS_BUFFER_FNS(JBDDirty, jbddirty)
|
||||
BUFFER_FNS(Revoked, revoked)
|
||||
TAS_BUFFER_FNS(Revoked, revoked)
|
||||
BUFFER_FNS(RevokeValid, revokevalid)
|
||||
TAS_BUFFER_FNS(RevokeValid, revokevalid)
|
||||
BUFFER_FNS(Freed, freed)
|
||||
BUFFER_FNS(Shadow, shadow)
|
||||
BUFFER_FNS(Verified, verified)
|
||||
|
||||
#include <linux/jbd_common.h>
|
||||
|
||||
#define J_ASSERT(assert) BUG_ON(!(assert))
|
||||
|
|
|
@ -1,32 +1,6 @@
|
|||
#ifndef _LINUX_JBD_STATE_H
|
||||
#define _LINUX_JBD_STATE_H
|
||||
|
||||
enum jbd_state_bits {
|
||||
BH_JBD /* Has an attached ext3 journal_head */
|
||||
= BH_PrivateStart,
|
||||
BH_JWrite, /* Being written to log (@@@ DEBUGGING) */
|
||||
BH_Freed, /* Has been freed (truncated) */
|
||||
BH_Revoked, /* Has been revoked from the log */
|
||||
BH_RevokeValid, /* Revoked flag is valid */
|
||||
BH_JBDDirty, /* Is dirty but journaled */
|
||||
BH_State, /* Pins most journal_head state */
|
||||
BH_JournalHead, /* Pins bh->b_private and jh->b_bh */
|
||||
BH_Unshadow, /* Dummy bit, for BJ_Shadow wakeup filtering */
|
||||
BH_Verified, /* Metadata block has been verified ok */
|
||||
BH_JBDPrivateStart, /* First bit available for private use by FS */
|
||||
};
|
||||
|
||||
BUFFER_FNS(JBD, jbd)
|
||||
BUFFER_FNS(JWrite, jwrite)
|
||||
BUFFER_FNS(JBDDirty, jbddirty)
|
||||
TAS_BUFFER_FNS(JBDDirty, jbddirty)
|
||||
BUFFER_FNS(Revoked, revoked)
|
||||
TAS_BUFFER_FNS(Revoked, revoked)
|
||||
BUFFER_FNS(RevokeValid, revokevalid)
|
||||
TAS_BUFFER_FNS(RevokeValid, revokevalid)
|
||||
BUFFER_FNS(Freed, freed)
|
||||
BUFFER_FNS(Verified, verified)
|
||||
|
||||
static inline struct buffer_head *jh2bh(struct journal_head *jh)
|
||||
{
|
||||
return jh->b_bh;
|
||||
|
|
Loading…
Reference in a new issue