Merge branch 'upstream-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mfasheh/ocfs2

* 'upstream-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mfasheh/ocfs2: (31 commits)
  ocfs2: clean up bh null checks
  ocfs2: document access rules for blocked_lock_list
  configfs: file.c fix possible recursive locking
  configfs: dir.c fix possible recursive locking
  configfs: Remove EXPERIMENTAL
  ocfs2: bump version number
  ocfs2/dlm: Clear joining_node on hearbeat node down
  ocfs2: convert byte order of constant instead of variable
  ocfs2: Update default cluster timeouts
  ocfs2: printf fixes
  ocfs2: Use generic_file_llseek
  ocfs2: Safer read_inline_data()
  ocfs2: Silence false lockdep warnings
  [PATCH 2/2] ocfs2: cluster aware flock()
  [PATCH 1/2] ocfs2: add flock lock type
  ocfs2: Local alloc window size changeable via mount option
  ocfs2: Support commit= mount option
  ocfs2: Add missing permission checks
  [PATCH 2/2] ocfs2: Implement group add for online resize
  [PATCH 1/2] ocfs2: Add group extend for online resize
  ...
This commit is contained in:
Linus Torvalds 2008-01-25 17:11:13 -08:00
commit 29bd17af7d
53 changed files with 2019 additions and 1606 deletions

View file

@ -35,7 +35,6 @@ Features which OCFS2 does not support yet:
- Directory change notification (F_NOTIFY)
- Distributed Caching (F_SETLEASE/F_GETLEASE/break_lease)
- POSIX ACLs
- readpages / writepages (not user visible)
Mount options
=============
@ -62,3 +61,18 @@ data=writeback Data ordering is not preserved, data may be written
preferred_slot=0(*) During mount, try to use this filesystem slot first. If
it is in use by another node, the first empty one found
will be chosen. Invalid values will be ignored.
commit=nrsec (*) Ocfs2 can be told to sync all its data and metadata
every 'nrsec' seconds. The default value is 5 seconds.
This means that if you lose your power, you will lose
as much as the latest 5 seconds of work (your
filesystem will not be damaged though, thanks to the
journaling). This default value (or any low value)
will hurt performance, but it's good for data-safety.
Setting it to 0 will have the same effect as leaving
it at the default (5 seconds).
Setting it to very large values will improve
performance.
localalloc=8(*) Allows custom localalloc size in MB. If the value is too
large, the fs will silently revert it to the default.
Localalloc is not enabled for local mounts.
localflocks This disables cluster aware flock.

View file

@ -138,6 +138,7 @@ Code Seq# Include File Comments
'm' 00-1F net/irda/irmod.h conflict!
'n' 00-7F linux/ncp_fs.h
'n' E0-FF video/matrox.h matroxfb
'o' 00-1F fs/ocfs2/ocfs2_fs.h OCFS2
'p' 00-0F linux/phantom.h conflict! (OpenHaptics needs this)
'p' 00-3F linux/mc146818rtc.h conflict!
'p' 40-7F linux/nvram.h

View file

@ -440,14 +440,8 @@ config OCFS2_FS
Tools web page: http://oss.oracle.com/projects/ocfs2-tools
OCFS2 mailing lists: http://oss.oracle.com/projects/ocfs2/mailman/
Note: Features which OCFS2 does not support yet:
- extended attributes
- quotas
- cluster aware flock
- Directory change notification (F_NOTIFY)
- Distributed Caching (F_SETLEASE/F_GETLEASE/break_lease)
- POSIX ACLs
- readpages / writepages (not user visible)
For more information on OCFS2, see the file
<file:Documentation/filesystems/ocfs2.txt>.
config OCFS2_DEBUG_MASKLOG
bool "OCFS2 logging support"
@ -1028,8 +1022,8 @@ config HUGETLB_PAGE
def_bool HUGETLBFS
config CONFIGFS_FS
tristate "Userspace-driven configuration filesystem (EXPERIMENTAL)"
depends on SYSFS && EXPERIMENTAL
tristate "Userspace-driven configuration filesystem"
depends on SYSFS
help
configfs is a ram-based filesystem that provides the converse
of sysfs's functionality. Where sysfs is a filesystem-based

View file

@ -546,7 +546,7 @@ static int populate_groups(struct config_group *group)
* That said, taking our i_mutex is closer to mkdir
* emulation, and shouldn't hurt.
*/
mutex_lock(&dentry->d_inode->i_mutex);
mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD);
for (i = 0; group->default_groups[i]; i++) {
new_group = group->default_groups[i];
@ -1405,7 +1405,8 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys)
sd = configfs_sb->s_root->d_fsdata;
link_group(to_config_group(sd->s_element), group);
mutex_lock(&configfs_sb->s_root->d_inode->i_mutex);
mutex_lock_nested(&configfs_sb->s_root->d_inode->i_mutex,
I_MUTEX_PARENT);
name.name = group->cg_item.ci_name;
name.len = strlen(name.name);

View file

@ -320,7 +320,7 @@ int configfs_add_file(struct dentry * dir, const struct configfs_attribute * att
umode_t mode = (attr->ca_mode & S_IALLUGO) | S_IFREG;
int error = 0;
mutex_lock(&dir->d_inode->i_mutex);
mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_NORMAL);
error = configfs_make_dirent(parent_sd, NULL, (void *) attr, mode, type);
mutex_unlock(&dir->d_inode->i_mutex);

View file

@ -19,16 +19,17 @@ ocfs2-objs := \
ioctl.o \
journal.o \
localalloc.o \
locks.o \
mmap.o \
namei.o \
resize.o \
slot_map.o \
suballoc.o \
super.o \
symlink.o \
sysfile.o \
uptodate.o \
ver.o \
vote.o
ver.o
obj-$(CONFIG_OCFS2_FS) += cluster/
obj-$(CONFIG_OCFS2_FS) += dlm/

View file

@ -4731,7 +4731,7 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
mutex_lock(&data_alloc_inode->i_mutex);
status = ocfs2_meta_lock(data_alloc_inode, &data_alloc_bh, 1);
status = ocfs2_inode_lock(data_alloc_inode, &data_alloc_bh, 1);
if (status < 0) {
mlog_errno(status);
goto out_mutex;
@ -4753,7 +4753,7 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
out_unlock:
brelse(data_alloc_bh);
ocfs2_meta_unlock(data_alloc_inode, 1);
ocfs2_inode_unlock(data_alloc_inode, 1);
out_mutex:
mutex_unlock(&data_alloc_inode->i_mutex);
@ -5077,7 +5077,7 @@ static int ocfs2_free_cached_items(struct ocfs2_super *osb,
mutex_lock(&inode->i_mutex);
ret = ocfs2_meta_lock(inode, &di_bh, 1);
ret = ocfs2_inode_lock(inode, &di_bh, 1);
if (ret) {
mlog_errno(ret);
goto out_mutex;
@ -5118,7 +5118,7 @@ out_journal:
ocfs2_commit_trans(osb, handle);
out_unlock:
ocfs2_meta_unlock(inode, 1);
ocfs2_inode_unlock(inode, 1);
brelse(di_bh);
out_mutex:
mutex_unlock(&inode->i_mutex);

View file

@ -26,6 +26,7 @@
#include <asm/byteorder.h>
#include <linux/swap.h>
#include <linux/pipe_fs_i.h>
#include <linux/mpage.h>
#define MLOG_MASK_PREFIX ML_FILE_IO
#include <cluster/masklog.h>
@ -139,7 +140,8 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
{
int err = 0;
unsigned int ext_flags;
u64 p_blkno, past_eof;
u64 max_blocks = bh_result->b_size >> inode->i_blkbits;
u64 p_blkno, count, past_eof;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode,
@ -155,7 +157,7 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
goto bail;
}
err = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, NULL,
err = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, &count,
&ext_flags);
if (err) {
mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, "
@ -164,6 +166,9 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
goto bail;
}
if (max_blocks < count)
count = max_blocks;
/*
* ocfs2 never allocates in this function - the only time we
* need to use BH_New is when we're extending i_size on a file
@ -178,6 +183,8 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
map_bh(bh_result, inode->i_sb, p_blkno);
bh_result->b_size = count << inode->i_blkbits;
if (!ocfs2_sparse_alloc(osb)) {
if (p_blkno == 0) {
err = -EIO;
@ -210,7 +217,7 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page,
struct buffer_head *di_bh)
{
void *kaddr;
unsigned int size;
loff_t size;
struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
if (!(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL)) {
@ -224,8 +231,9 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page,
if (size > PAGE_CACHE_SIZE ||
size > ocfs2_max_inline_data(inode->i_sb)) {
ocfs2_error(inode->i_sb,
"Inode %llu has with inline data has bad size: %u",
(unsigned long long)OCFS2_I(inode)->ip_blkno, size);
"Inode %llu has with inline data has bad size: %Lu",
(unsigned long long)OCFS2_I(inode)->ip_blkno,
(unsigned long long)size);
return -EROFS;
}
@ -275,7 +283,7 @@ static int ocfs2_readpage(struct file *file, struct page *page)
mlog_entry("(0x%p, %lu)\n", file, (page ? page->index : 0));
ret = ocfs2_meta_lock_with_page(inode, NULL, 0, page);
ret = ocfs2_inode_lock_with_page(inode, NULL, 0, page);
if (ret != 0) {
if (ret == AOP_TRUNCATED_PAGE)
unlock = 0;
@ -285,7 +293,7 @@ static int ocfs2_readpage(struct file *file, struct page *page)
if (down_read_trylock(&oi->ip_alloc_sem) == 0) {
ret = AOP_TRUNCATED_PAGE;
goto out_meta_unlock;
goto out_inode_unlock;
}
/*
@ -305,25 +313,16 @@ static int ocfs2_readpage(struct file *file, struct page *page)
goto out_alloc;
}
ret = ocfs2_data_lock_with_page(inode, 0, page);
if (ret != 0) {
if (ret == AOP_TRUNCATED_PAGE)
unlock = 0;
mlog_errno(ret);
goto out_alloc;
}
if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
ret = ocfs2_readpage_inline(inode, page);
else
ret = block_read_full_page(page, ocfs2_get_block);
unlock = 0;
ocfs2_data_unlock(inode, 0);
out_alloc:
up_read(&OCFS2_I(inode)->ip_alloc_sem);
out_meta_unlock:
ocfs2_meta_unlock(inode, 0);
out_inode_unlock:
ocfs2_inode_unlock(inode, 0);
out:
if (unlock)
unlock_page(page);
@ -331,6 +330,62 @@ out:
return ret;
}
/*
* This is used only for read-ahead. Failures or difficult to handle
* situations are safe to ignore.
*
* Right now, we don't bother with BH_Boundary - in-inode extent lists
* are quite large (243 extents on 4k blocks), so most inodes don't
* grow out to a tree. If need be, detecting boundary extents could
* trivially be added in a future version of ocfs2_get_block().
*/
static int ocfs2_readpages(struct file *filp, struct address_space *mapping,
struct list_head *pages, unsigned nr_pages)
{
int ret, err = -EIO;
struct inode *inode = mapping->host;
struct ocfs2_inode_info *oi = OCFS2_I(inode);
loff_t start;
struct page *last;
/*
* Use the nonblocking flag for the dlm code to avoid page
* lock inversion, but don't bother with retrying.
*/
ret = ocfs2_inode_lock_full(inode, NULL, 0, OCFS2_LOCK_NONBLOCK);
if (ret)
return err;
if (down_read_trylock(&oi->ip_alloc_sem) == 0) {
ocfs2_inode_unlock(inode, 0);
return err;
}
/*
* Don't bother with inline-data. There isn't anything
* to read-ahead in that case anyway...
*/
if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
goto out_unlock;
/*
* Check whether a remote node truncated this file - we just
* drop out in that case as it's not worth handling here.
*/
last = list_entry(pages->prev, struct page, lru);
start = (loff_t)last->index << PAGE_CACHE_SHIFT;
if (start >= i_size_read(inode))
goto out_unlock;
err = mpage_readpages(mapping, pages, nr_pages, ocfs2_get_block);
out_unlock:
up_read(&oi->ip_alloc_sem);
ocfs2_inode_unlock(inode, 0);
return err;
}
/* Note: Because we don't support holes, our allocation has
* already happened (allocation writes zeros to the file data)
* so we don't have to worry about ordered writes in
@ -452,7 +507,7 @@ static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
* accessed concurrently from multiple nodes.
*/
if (!INODE_JOURNAL(inode)) {
err = ocfs2_meta_lock(inode, NULL, 0);
err = ocfs2_inode_lock(inode, NULL, 0);
if (err) {
if (err != -ENOENT)
mlog_errno(err);
@ -467,7 +522,7 @@ static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
if (!INODE_JOURNAL(inode)) {
up_read(&OCFS2_I(inode)->ip_alloc_sem);
ocfs2_meta_unlock(inode, 0);
ocfs2_inode_unlock(inode, 0);
}
if (err) {
@ -638,34 +693,12 @@ static ssize_t ocfs2_direct_IO(int rw,
if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
return 0;
if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) {
/*
* We get PR data locks even for O_DIRECT. This
* allows concurrent O_DIRECT I/O but doesn't let
* O_DIRECT with extending and buffered zeroing writes
* race. If they did race then the buffered zeroing
* could be written back after the O_DIRECT I/O. It's
* one thing to tell people not to mix buffered and
* O_DIRECT writes, but expecting them to understand
* that file extension is also an implicit buffered
* write is too much. By getting the PR we force
* writeback of the buffered zeroing before
* proceeding.
*/
ret = ocfs2_data_lock(inode, 0);
if (ret < 0) {
mlog_errno(ret);
goto out;
}
ocfs2_data_unlock(inode, 0);
}
ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
inode->i_sb->s_bdev, iov, offset,
nr_segs,
ocfs2_direct_IO_get_blocks,
ocfs2_dio_end_io);
out:
mlog_exit(ret);
return ret;
}
@ -1754,7 +1787,7 @@ static int ocfs2_write_begin(struct file *file, struct address_space *mapping,
struct buffer_head *di_bh = NULL;
struct inode *inode = mapping->host;
ret = ocfs2_meta_lock(inode, &di_bh, 1);
ret = ocfs2_inode_lock(inode, &di_bh, 1);
if (ret) {
mlog_errno(ret);
return ret;
@ -1769,30 +1802,22 @@ static int ocfs2_write_begin(struct file *file, struct address_space *mapping,
*/
down_write(&OCFS2_I(inode)->ip_alloc_sem);
ret = ocfs2_data_lock(inode, 1);
if (ret) {
mlog_errno(ret);
goto out_fail;
}
ret = ocfs2_write_begin_nolock(mapping, pos, len, flags, pagep,
fsdata, di_bh, NULL);
if (ret) {
mlog_errno(ret);
goto out_fail_data;
goto out_fail;
}
brelse(di_bh);
return 0;
out_fail_data:
ocfs2_data_unlock(inode, 1);
out_fail:
up_write(&OCFS2_I(inode)->ip_alloc_sem);
brelse(di_bh);
ocfs2_meta_unlock(inode, 1);
ocfs2_inode_unlock(inode, 1);
return ret;
}
@ -1908,15 +1933,15 @@ static int ocfs2_write_end(struct file *file, struct address_space *mapping,
ret = ocfs2_write_end_nolock(mapping, pos, len, copied, page, fsdata);
ocfs2_data_unlock(inode, 1);
up_write(&OCFS2_I(inode)->ip_alloc_sem);
ocfs2_meta_unlock(inode, 1);
ocfs2_inode_unlock(inode, 1);
return ret;
}
const struct address_space_operations ocfs2_aops = {
.readpage = ocfs2_readpage,
.readpages = ocfs2_readpages,
.writepage = ocfs2_writepage,
.write_begin = ocfs2_write_begin,
.write_end = ocfs2_write_end,

View file

@ -79,7 +79,7 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
* information for this bh as it's not marked locally
* uptodate. */
ret = -EIO;
brelse(bh);
put_bh(bh);
}
mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
@ -256,7 +256,7 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
* for this bh as it's not marked locally
* uptodate. */
status = -EIO;
brelse(bh);
put_bh(bh);
bhs[i] = NULL;
continue;
}
@ -280,3 +280,64 @@ bail:
mlog_exit(status);
return status;
}
/* Check whether the blkno is the super block or one of the backups. */
static void ocfs2_check_super_or_backup(struct super_block *sb,
sector_t blkno)
{
int i;
u64 backup_blkno;
if (blkno == OCFS2_SUPER_BLOCK_BLKNO)
return;
for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) {
backup_blkno = ocfs2_backup_super_blkno(sb, i);
if (backup_blkno == blkno)
return;
}
BUG();
}
/*
* Write super block and backups doesn't need to collaborate with journal,
* so we don't need to lock ip_io_mutex and inode doesn't need to bea passed
* into this function.
*/
int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
struct buffer_head *bh)
{
int ret = 0;
mlog_entry_void();
BUG_ON(buffer_jbd(bh));
ocfs2_check_super_or_backup(osb->sb, bh->b_blocknr);
if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) {
ret = -EROFS;
goto out;
}
lock_buffer(bh);
set_buffer_uptodate(bh);
/* remove from dirty list before I/O. */
clear_buffer_dirty(bh);
get_bh(bh); /* for end_buffer_write_sync() */
bh->b_end_io = end_buffer_write_sync;
submit_bh(WRITE, bh);
wait_on_buffer(bh);
if (!buffer_uptodate(bh)) {
ret = -EIO;
put_bh(bh);
}
out:
mlog_exit(ret);
return ret;
}

View file

@ -47,6 +47,8 @@ int ocfs2_read_blocks(struct ocfs2_super *osb,
int flags,
struct inode *inode);
int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
struct buffer_head *bh);
#define OCFS2_BH_CACHED 1
#define OCFS2_BH_READAHEAD 8

View file

@ -35,7 +35,7 @@
#define O2HB_LIVE_THRESHOLD 2
/* number of equal samples to be seen as dead */
extern unsigned int o2hb_dead_threshold;
#define O2HB_DEFAULT_DEAD_THRESHOLD 7
#define O2HB_DEFAULT_DEAD_THRESHOLD 31
/* Otherwise MAX_WRITE_TIMEOUT will be zero... */
#define O2HB_MIN_DEAD_THRESHOLD 2
#define O2HB_MAX_WRITE_TIMEOUT_MS (O2HB_REGION_TIMEOUT_MS * (o2hb_dead_threshold - 1))

View file

@ -60,8 +60,8 @@ typedef void (o2net_post_msg_handler_func)(int status, void *data,
/* same as hb delay, we're waiting for another node to recognize our hb */
#define O2NET_RECONNECT_DELAY_MS_DEFAULT 2000
#define O2NET_KEEPALIVE_DELAY_MS_DEFAULT 5000
#define O2NET_IDLE_TIMEOUT_MS_DEFAULT 10000
#define O2NET_KEEPALIVE_DELAY_MS_DEFAULT 2000
#define O2NET_IDLE_TIMEOUT_MS_DEFAULT 30000
/* TODO: figure this out.... */

View file

@ -38,6 +38,12 @@
* locking semantics of the file system using the protocol. It should
* be somewhere else, I'm sure, but right now it isn't.
*
* New in version 10:
* - Meta/data locks combined
*
* New in version 9:
* - All votes removed
*
* New in version 8:
* - Replace delete inode votes with a cluster lock
*
@ -60,7 +66,7 @@
* - full 64 bit i_size in the metadata lock lvbs
* - introduction of "rw" lock and pushing meta/data locking down
*/
#define O2NET_PROTOCOL_VERSION 8ULL
#define O2NET_PROTOCOL_VERSION 10ULL
struct o2net_handshake {
__be64 protocol_version;
__be64 connector_id;

View file

@ -28,7 +28,7 @@
#include "ver.h"
#define CLUSTER_BUILD_VERSION "1.3.3"
#define CLUSTER_BUILD_VERSION "1.5.0"
#define VERSION_STR "OCFS2 Node Manager " CLUSTER_BUILD_VERSION

View file

@ -128,9 +128,9 @@ static int ocfs2_match_dentry(struct dentry *dentry,
/*
* Walk the inode alias list, and find a dentry which has a given
* parent. ocfs2_dentry_attach_lock() wants to find _any_ alias as it
* is looking for a dentry_lock reference. The vote thread is looking
* to unhash aliases, so we allow it to skip any that already have
* that property.
* is looking for a dentry_lock reference. The downconvert thread is
* looking to unhash aliases, so we allow it to skip any that already
* have that property.
*/
struct dentry *ocfs2_find_local_alias(struct inode *inode,
u64 parent_blkno,
@ -266,7 +266,7 @@ int ocfs2_dentry_attach_lock(struct dentry *dentry,
dl->dl_count = 0;
/*
* Does this have to happen below, for all attaches, in case
* the struct inode gets blown away by votes?
* the struct inode gets blown away by the downconvert thread?
*/
dl->dl_inode = igrab(inode);
dl->dl_parent_blkno = parent_blkno;

View file

@ -846,14 +846,14 @@ int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir)
mlog_entry("dirino=%llu\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno);
error = ocfs2_meta_lock_atime(inode, filp->f_vfsmnt, &lock_level);
error = ocfs2_inode_lock_atime(inode, filp->f_vfsmnt, &lock_level);
if (lock_level && error >= 0) {
/* We release EX lock which used to update atime
* and get PR lock again to reduce contention
* on commonly accessed directories. */
ocfs2_meta_unlock(inode, 1);
ocfs2_inode_unlock(inode, 1);
lock_level = 0;
error = ocfs2_meta_lock(inode, NULL, 0);
error = ocfs2_inode_lock(inode, NULL, 0);
}
if (error < 0) {
if (error != -ENOENT)
@ -865,7 +865,7 @@ int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir)
error = ocfs2_dir_foreach_blk(inode, &filp->f_version, &filp->f_pos,
dirent, filldir, NULL);
ocfs2_meta_unlock(inode, lock_level);
ocfs2_inode_unlock(inode, lock_level);
bail_nolock:
mlog_exit(error);

View file

@ -28,7 +28,7 @@
#include "dlmfsver.h"
#define DLM_BUILD_VERSION "1.3.3"
#define DLM_BUILD_VERSION "1.5.0"
#define VERSION_STR "OCFS2 DLMFS " DLM_BUILD_VERSION

View file

@ -2270,6 +2270,12 @@ static void __dlm_hb_node_down(struct dlm_ctxt *dlm, int idx)
}
}
/* Clean up join state on node death. */
if (dlm->joining_node == idx) {
mlog(0, "Clearing join state for node %u\n", idx);
__dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
}
/* check to see if the node is already considered dead */
if (!test_bit(idx, dlm->live_nodes_map)) {
mlog(0, "for domain %s, node %d is already dead. "
@ -2288,12 +2294,6 @@ static void __dlm_hb_node_down(struct dlm_ctxt *dlm, int idx)
clear_bit(idx, dlm->live_nodes_map);
/* Clean up join state on node death. */
if (dlm->joining_node == idx) {
mlog(0, "Clearing join state for node %u\n", idx);
__dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
}
/* make sure local cleanup occurs before the heartbeat events */
if (!test_bit(idx, dlm->recovery_map))
dlm_do_local_recovery_cleanup(dlm, idx);
@ -2321,6 +2321,13 @@ void dlm_hb_node_down_cb(struct o2nm_node *node, int idx, void *data)
if (!dlm_grab(dlm))
return;
/*
* This will notify any dlm users that a node in our domain
* went away without notifying us first.
*/
if (test_bit(idx, dlm->domain_map))
dlm_fire_domain_eviction_callbacks(dlm, idx);
spin_lock(&dlm->spinlock);
__dlm_hb_node_down(dlm, idx);
spin_unlock(&dlm->spinlock);

View file

@ -28,7 +28,7 @@
#include "dlmver.h"
#define DLM_BUILD_VERSION "1.3.3"
#define DLM_BUILD_VERSION "1.5.0"
#define VERSION_STR "OCFS2 DLM " DLM_BUILD_VERSION

View file

@ -55,7 +55,6 @@
#include "slot_map.h"
#include "super.h"
#include "uptodate.h"
#include "vote.h"
#include "buffer_head_io.h"
@ -69,6 +68,7 @@ struct ocfs2_mask_waiter {
static struct ocfs2_super *ocfs2_get_dentry_osb(struct ocfs2_lock_res *lockres);
static struct ocfs2_super *ocfs2_get_inode_osb(struct ocfs2_lock_res *lockres);
static struct ocfs2_super *ocfs2_get_file_osb(struct ocfs2_lock_res *lockres);
/*
* Return value from ->downconvert_worker functions.
@ -153,10 +153,10 @@ struct ocfs2_lock_res_ops {
struct ocfs2_super * (*get_osb)(struct ocfs2_lock_res *);
/*
* Optionally called in the downconvert (or "vote") thread
* after a successful downconvert. The lockres will not be
* referenced after this callback is called, so it is safe to
* free memory, etc.
* Optionally called in the downconvert thread after a
* successful downconvert. The lockres will not be referenced
* after this callback is called, so it is safe to free
* memory, etc.
*
* The exact semantics of when this is called are controlled
* by ->downconvert_worker()
@ -225,17 +225,12 @@ static struct ocfs2_lock_res_ops ocfs2_inode_rw_lops = {
.flags = 0,
};
static struct ocfs2_lock_res_ops ocfs2_inode_meta_lops = {
static struct ocfs2_lock_res_ops ocfs2_inode_inode_lops = {
.get_osb = ocfs2_get_inode_osb,
.check_downconvert = ocfs2_check_meta_downconvert,
.set_lvb = ocfs2_set_meta_lvb,
.flags = LOCK_TYPE_REQUIRES_REFRESH|LOCK_TYPE_USES_LVB,
};
static struct ocfs2_lock_res_ops ocfs2_inode_data_lops = {
.get_osb = ocfs2_get_inode_osb,
.downconvert_worker = ocfs2_data_convert_worker,
.flags = 0,
.flags = LOCK_TYPE_REQUIRES_REFRESH|LOCK_TYPE_USES_LVB,
};
static struct ocfs2_lock_res_ops ocfs2_super_lops = {
@ -258,10 +253,14 @@ static struct ocfs2_lock_res_ops ocfs2_inode_open_lops = {
.flags = 0,
};
static struct ocfs2_lock_res_ops ocfs2_flock_lops = {
.get_osb = ocfs2_get_file_osb,
.flags = 0,
};
static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
{
return lockres->l_type == OCFS2_LOCK_TYPE_META ||
lockres->l_type == OCFS2_LOCK_TYPE_DATA ||
lockres->l_type == OCFS2_LOCK_TYPE_RW ||
lockres->l_type == OCFS2_LOCK_TYPE_OPEN;
}
@ -310,12 +309,24 @@ static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
"resource %s: %s\n", dlm_errname(_stat), _func, \
_lockres->l_name, dlm_errmsg(_stat)); \
} while (0)
static void ocfs2_vote_on_unlock(struct ocfs2_super *osb,
struct ocfs2_lock_res *lockres);
static int ocfs2_meta_lock_update(struct inode *inode,
static int ocfs2_downconvert_thread(void *arg);
static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb,
struct ocfs2_lock_res *lockres);
static int ocfs2_inode_lock_update(struct inode *inode,
struct buffer_head **bh);
static void ocfs2_drop_osb_locks(struct ocfs2_super *osb);
static inline int ocfs2_highest_compat_lock_level(int level);
static void ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres,
int new_level);
static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
struct ocfs2_lock_res *lockres,
int new_level,
int lvb);
static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb,
struct ocfs2_lock_res *lockres);
static int ocfs2_cancel_convert(struct ocfs2_super *osb,
struct ocfs2_lock_res *lockres);
static void ocfs2_build_lock_name(enum ocfs2_lock_type type,
u64 blkno,
@ -402,10 +413,7 @@ void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
ops = &ocfs2_inode_rw_lops;
break;
case OCFS2_LOCK_TYPE_META:
ops = &ocfs2_inode_meta_lops;
break;
case OCFS2_LOCK_TYPE_DATA:
ops = &ocfs2_inode_data_lops;
ops = &ocfs2_inode_inode_lops;
break;
case OCFS2_LOCK_TYPE_OPEN:
ops = &ocfs2_inode_open_lops;
@ -428,6 +436,13 @@ static struct ocfs2_super *ocfs2_get_inode_osb(struct ocfs2_lock_res *lockres)
return OCFS2_SB(inode->i_sb);
}
static struct ocfs2_super *ocfs2_get_file_osb(struct ocfs2_lock_res *lockres)
{
struct ocfs2_file_private *fp = lockres->l_priv;
return OCFS2_SB(fp->fp_file->f_mapping->host->i_sb);
}
static __u64 ocfs2_get_dentry_lock_ino(struct ocfs2_lock_res *lockres)
{
__be64 inode_blkno_be;
@ -508,6 +523,21 @@ static void ocfs2_rename_lock_res_init(struct ocfs2_lock_res *res,
&ocfs2_rename_lops, osb);
}
void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres,
struct ocfs2_file_private *fp)
{
struct inode *inode = fp->fp_file->f_mapping->host;
struct ocfs2_inode_info *oi = OCFS2_I(inode);
ocfs2_lock_res_init_once(lockres);
ocfs2_build_lock_name(OCFS2_LOCK_TYPE_FLOCK, oi->ip_blkno,
inode->i_generation, lockres->l_name);
ocfs2_lock_res_init_common(OCFS2_SB(inode->i_sb), lockres,
OCFS2_LOCK_TYPE_FLOCK, &ocfs2_flock_lops,
fp);
lockres->l_flags |= OCFS2_LOCK_NOCACHE;
}
void ocfs2_lock_res_free(struct ocfs2_lock_res *res)
{
mlog_entry_void();
@ -724,6 +754,13 @@ static void ocfs2_blocking_ast(void *opaque, int level)
lockres->l_name, level, lockres->l_level,
ocfs2_lock_type_string(lockres->l_type));
/*
* We can skip the bast for locks which don't enable caching -
* they'll be dropped at the earliest possible time anyway.
*/
if (lockres->l_flags & OCFS2_LOCK_NOCACHE)
return;
spin_lock_irqsave(&lockres->l_lock, flags);
needs_downconvert = ocfs2_generic_handle_bast(lockres, level);
if (needs_downconvert)
@ -732,7 +769,7 @@ static void ocfs2_blocking_ast(void *opaque, int level)
wake_up(&lockres->l_event);
ocfs2_kick_vote_thread(osb);
ocfs2_wake_downconvert_thread(osb);
}
static void ocfs2_locking_ast(void *opaque)
@ -935,6 +972,21 @@ static int lockres_remove_mask_waiter(struct ocfs2_lock_res *lockres,
}
static int ocfs2_wait_for_mask_interruptible(struct ocfs2_mask_waiter *mw,
struct ocfs2_lock_res *lockres)
{
int ret;
ret = wait_for_completion_interruptible(&mw->mw_complete);
if (ret)
lockres_remove_mask_waiter(lockres, mw);
else
ret = mw->mw_status;
/* Re-arm the completion in case we want to wait on it again */
INIT_COMPLETION(mw->mw_complete);
return ret;
}
static int ocfs2_cluster_lock(struct ocfs2_super *osb,
struct ocfs2_lock_res *lockres,
int level,
@ -1089,7 +1141,7 @@ static void ocfs2_cluster_unlock(struct ocfs2_super *osb,
mlog_entry_void();
spin_lock_irqsave(&lockres->l_lock, flags);
ocfs2_dec_holders(lockres, level);
ocfs2_vote_on_unlock(osb, lockres);
ocfs2_downconvert_on_unlock(osb, lockres);
spin_unlock_irqrestore(&lockres->l_lock, flags);
mlog_exit_void();
}
@ -1147,13 +1199,7 @@ int ocfs2_create_new_inode_locks(struct inode *inode)
* We don't want to use LKM_LOCAL on a meta data lock as they
* don't use a generation in their lock names.
*/
ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_meta_lockres, 1, 0);
if (ret) {
mlog_errno(ret);
goto bail;
}
ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_data_lockres, 1, 1);
ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_inode_lockres, 1, 0);
if (ret) {
mlog_errno(ret);
goto bail;
@ -1311,76 +1357,221 @@ out:
mlog_exit_void();
}
int ocfs2_data_lock_full(struct inode *inode,
int write,
int arg_flags)
{
int status = 0, level;
struct ocfs2_lock_res *lockres;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
BUG_ON(!inode);
mlog_entry_void();
mlog(0, "inode %llu take %s DATA lock\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno,
write ? "EXMODE" : "PRMODE");
/* We'll allow faking a readonly data lock for
* rodevices. */
if (ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb))) {
if (write) {
status = -EROFS;
mlog_errno(status);
}
goto out;
}
if (ocfs2_mount_local(osb))
goto out;
lockres = &OCFS2_I(inode)->ip_data_lockres;
level = write ? LKM_EXMODE : LKM_PRMODE;
status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, level,
0, arg_flags);
if (status < 0 && status != -EAGAIN)
mlog_errno(status);
out:
mlog_exit(status);
return status;
}
/* see ocfs2_meta_lock_with_page() */
int ocfs2_data_lock_with_page(struct inode *inode,
int write,
struct page *page)
static int ocfs2_flock_handle_signal(struct ocfs2_lock_res *lockres,
int level)
{
int ret;
struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres);
unsigned long flags;
struct ocfs2_mask_waiter mw;
ret = ocfs2_data_lock_full(inode, write, OCFS2_LOCK_NONBLOCK);
if (ret == -EAGAIN) {
unlock_page(page);
if (ocfs2_data_lock(inode, write) == 0)
ocfs2_data_unlock(inode, write);
ret = AOP_TRUNCATED_PAGE;
ocfs2_init_mask_waiter(&mw);
retry_cancel:
spin_lock_irqsave(&lockres->l_lock, flags);
if (lockres->l_flags & OCFS2_LOCK_BUSY) {
ret = ocfs2_prepare_cancel_convert(osb, lockres);
if (ret) {
spin_unlock_irqrestore(&lockres->l_lock, flags);
ret = ocfs2_cancel_convert(osb, lockres);
if (ret < 0) {
mlog_errno(ret);
goto out;
}
goto retry_cancel;
}
lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
spin_unlock_irqrestore(&lockres->l_lock, flags);
ocfs2_wait_for_mask(&mw);
goto retry_cancel;
}
ret = -ERESTARTSYS;
/*
* We may still have gotten the lock, in which case there's no
* point to restarting the syscall.
*/
if (lockres->l_level == level)
ret = 0;
mlog(0, "Cancel returning %d. flags: 0x%lx, level: %d, act: %d\n", ret,
lockres->l_flags, lockres->l_level, lockres->l_action);
spin_unlock_irqrestore(&lockres->l_lock, flags);
out:
return ret;
}
static void ocfs2_vote_on_unlock(struct ocfs2_super *osb,
struct ocfs2_lock_res *lockres)
/*
* ocfs2_file_lock() and ocfs2_file_unlock() map to a single pair of
* flock() calls. The locking approach this requires is sufficiently
* different from all other cluster lock types that we implement a
* seperate path to the "low-level" dlm calls. In particular:
*
* - No optimization of lock levels is done - we take at exactly
* what's been requested.
*
* - No lock caching is employed. We immediately downconvert to
* no-lock at unlock time. This also means flock locks never go on
* the blocking list).
*
* - Since userspace can trivially deadlock itself with flock, we make
* sure to allow cancellation of a misbehaving applications flock()
* request.
*
* - Access to any flock lockres doesn't require concurrency, so we
* can simplify the code by requiring the caller to guarantee
* serialization of dlmglue flock calls.
*/
int ocfs2_file_lock(struct file *file, int ex, int trylock)
{
int ret, level = ex ? LKM_EXMODE : LKM_PRMODE;
unsigned int lkm_flags = trylock ? LKM_NOQUEUE : 0;
unsigned long flags;
struct ocfs2_file_private *fp = file->private_data;
struct ocfs2_lock_res *lockres = &fp->fp_flock;
struct ocfs2_super *osb = OCFS2_SB(file->f_mapping->host->i_sb);
struct ocfs2_mask_waiter mw;
ocfs2_init_mask_waiter(&mw);
if ((lockres->l_flags & OCFS2_LOCK_BUSY) ||
(lockres->l_level > LKM_NLMODE)) {
mlog(ML_ERROR,
"File lock \"%s\" has busy or locked state: flags: 0x%lx, "
"level: %u\n", lockres->l_name, lockres->l_flags,
lockres->l_level);
return -EINVAL;
}
spin_lock_irqsave(&lockres->l_lock, flags);
if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) {
lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
spin_unlock_irqrestore(&lockres->l_lock, flags);
/*
* Get the lock at NLMODE to start - that way we
* can cancel the upconvert request if need be.
*/
ret = ocfs2_lock_create(osb, lockres, LKM_NLMODE, 0);
if (ret < 0) {
mlog_errno(ret);
goto out;
}
ret = ocfs2_wait_for_mask(&mw);
if (ret) {
mlog_errno(ret);
goto out;
}
spin_lock_irqsave(&lockres->l_lock, flags);
}
lockres->l_action = OCFS2_AST_CONVERT;
lkm_flags |= LKM_CONVERT;
lockres->l_requested = level;
lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
spin_unlock_irqrestore(&lockres->l_lock, flags);
ret = dlmlock(osb->dlm, level, &lockres->l_lksb, lkm_flags,
lockres->l_name, OCFS2_LOCK_ID_MAX_LEN - 1,
ocfs2_locking_ast, lockres, ocfs2_blocking_ast);
if (ret != DLM_NORMAL) {
if (trylock && ret == DLM_NOTQUEUED)
ret = -EAGAIN;
else {
ocfs2_log_dlm_error("dlmlock", ret, lockres);
ret = -EINVAL;
}
ocfs2_recover_from_dlm_error(lockres, 1);
lockres_remove_mask_waiter(lockres, &mw);
goto out;
}
ret = ocfs2_wait_for_mask_interruptible(&mw, lockres);
if (ret == -ERESTARTSYS) {
/*
* Userspace can cause deadlock itself with
* flock(). Current behavior locally is to allow the
* deadlock, but abort the system call if a signal is
* received. We follow this example, otherwise a
* poorly written program could sit in kernel until
* reboot.
*
* Handling this is a bit more complicated for Ocfs2
* though. We can't exit this function with an
* outstanding lock request, so a cancel convert is
* required. We intentionally overwrite 'ret' - if the
* cancel fails and the lock was granted, it's easier
* to just bubble sucess back up to the user.
*/
ret = ocfs2_flock_handle_signal(lockres, level);
}
out:
mlog(0, "Lock: \"%s\" ex: %d, trylock: %d, returns: %d\n",
lockres->l_name, ex, trylock, ret);
return ret;
}
void ocfs2_file_unlock(struct file *file)
{
int ret;
unsigned long flags;
struct ocfs2_file_private *fp = file->private_data;
struct ocfs2_lock_res *lockres = &fp->fp_flock;
struct ocfs2_super *osb = OCFS2_SB(file->f_mapping->host->i_sb);
struct ocfs2_mask_waiter mw;
ocfs2_init_mask_waiter(&mw);
if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED))
return;
if (lockres->l_level == LKM_NLMODE)
return;
mlog(0, "Unlock: \"%s\" flags: 0x%lx, level: %d, act: %d\n",
lockres->l_name, lockres->l_flags, lockres->l_level,
lockres->l_action);
spin_lock_irqsave(&lockres->l_lock, flags);
/*
* Fake a blocking ast for the downconvert code.
*/
lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED);
lockres->l_blocking = LKM_EXMODE;
ocfs2_prepare_downconvert(lockres, LKM_NLMODE);
lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
spin_unlock_irqrestore(&lockres->l_lock, flags);
ret = ocfs2_downconvert_lock(osb, lockres, LKM_NLMODE, 0);
if (ret) {
mlog_errno(ret);
return;
}
ret = ocfs2_wait_for_mask(&mw);
if (ret)
mlog_errno(ret);
}
static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb,
struct ocfs2_lock_res *lockres)
{
int kick = 0;
mlog_entry_void();
/* If we know that another node is waiting on our lock, kick
* the vote thread * pre-emptively when we reach a release
* the downconvert thread * pre-emptively when we reach a release
* condition. */
if (lockres->l_flags & OCFS2_LOCK_BLOCKED) {
switch(lockres->l_blocking) {
@ -1398,27 +1589,7 @@ static void ocfs2_vote_on_unlock(struct ocfs2_super *osb,
}
if (kick)
ocfs2_kick_vote_thread(osb);
mlog_exit_void();
}
void ocfs2_data_unlock(struct inode *inode,
int write)
{
int level = write ? LKM_EXMODE : LKM_PRMODE;
struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_data_lockres;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
mlog_entry_void();
mlog(0, "inode %llu drop %s DATA lock\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno,
write ? "EXMODE" : "PRMODE");
if (!ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb)) &&
!ocfs2_mount_local(osb))
ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);
ocfs2_wake_downconvert_thread(osb);
mlog_exit_void();
}
@ -1442,11 +1613,11 @@ static u64 ocfs2_pack_timespec(struct timespec *spec)
/* Call this with the lockres locked. I am reasonably sure we don't
* need ip_lock in this function as anyone who would be changing those
* values is supposed to be blocked in ocfs2_meta_lock right now. */
* values is supposed to be blocked in ocfs2_inode_lock right now. */
static void __ocfs2_stuff_meta_lvb(struct inode *inode)
{
struct ocfs2_inode_info *oi = OCFS2_I(inode);
struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres;
struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres;
struct ocfs2_meta_lvb *lvb;
mlog_entry_void();
@ -1496,7 +1667,7 @@ static void ocfs2_unpack_timespec(struct timespec *spec,
static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
{
struct ocfs2_inode_info *oi = OCFS2_I(inode);
struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres;
struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres;
struct ocfs2_meta_lvb *lvb;
mlog_entry_void();
@ -1604,12 +1775,12 @@ static inline void ocfs2_complete_lock_res_refresh(struct ocfs2_lock_res *lockre
}
/* may or may not return a bh if it went to disk. */
static int ocfs2_meta_lock_update(struct inode *inode,
static int ocfs2_inode_lock_update(struct inode *inode,
struct buffer_head **bh)
{
int status = 0;
struct ocfs2_inode_info *oi = OCFS2_I(inode);
struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres;
struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres;
struct ocfs2_dinode *fe;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
@ -1721,7 +1892,7 @@ static int ocfs2_assign_bh(struct inode *inode,
* returns < 0 error if the callback will never be called, otherwise
* the result of the lock will be communicated via the callback.
*/
int ocfs2_meta_lock_full(struct inode *inode,
int ocfs2_inode_lock_full(struct inode *inode,
struct buffer_head **ret_bh,
int ex,
int arg_flags)
@ -1756,7 +1927,7 @@ int ocfs2_meta_lock_full(struct inode *inode,
wait_event(osb->recovery_event,
ocfs2_node_map_is_empty(osb, &osb->recovery_map));
lockres = &OCFS2_I(inode)->ip_meta_lockres;
lockres = &OCFS2_I(inode)->ip_inode_lockres;
level = ex ? LKM_EXMODE : LKM_PRMODE;
dlm_flags = 0;
if (arg_flags & OCFS2_META_LOCK_NOQUEUE)
@ -1795,11 +1966,11 @@ local:
}
/* This is fun. The caller may want a bh back, or it may
* not. ocfs2_meta_lock_update definitely wants one in, but
* not. ocfs2_inode_lock_update definitely wants one in, but
* may or may not read one, depending on what's in the
* LVB. The result of all of this is that we've *only* gone to
* disk if we have to, so the complexity is worthwhile. */
status = ocfs2_meta_lock_update(inode, &local_bh);
status = ocfs2_inode_lock_update(inode, &local_bh);
if (status < 0) {
if (status != -ENOENT)
mlog_errno(status);
@ -1821,7 +1992,7 @@ bail:
*ret_bh = NULL;
}
if (acquired)
ocfs2_meta_unlock(inode, ex);
ocfs2_inode_unlock(inode, ex);
}
if (local_bh)
@ -1832,19 +2003,20 @@ bail:
}
/*
* This is working around a lock inversion between tasks acquiring DLM locks
* while holding a page lock and the vote thread which blocks dlm lock acquiry
* while acquiring page locks.
* This is working around a lock inversion between tasks acquiring DLM
* locks while holding a page lock and the downconvert thread which
* blocks dlm lock acquiry while acquiring page locks.
*
* ** These _with_page variantes are only intended to be called from aop
* methods that hold page locks and return a very specific *positive* error
* code that aop methods pass up to the VFS -- test for errors with != 0. **
*
* The DLM is called such that it returns -EAGAIN if it would have blocked
* waiting for the vote thread. In that case we unlock our page so the vote
* thread can make progress. Once we've done this we have to return
* AOP_TRUNCATED_PAGE so the aop method that called us can bubble that back up
* into the VFS who will then immediately retry the aop call.
* The DLM is called such that it returns -EAGAIN if it would have
* blocked waiting for the downconvert thread. In that case we unlock
* our page so the downconvert thread can make progress. Once we've
* done this we have to return AOP_TRUNCATED_PAGE so the aop method
* that called us can bubble that back up into the VFS who will then
* immediately retry the aop call.
*
* We do a blocking lock and immediate unlock before returning, though, so that
* the lock has a great chance of being cached on this node by the time the VFS
@ -1852,32 +2024,32 @@ bail:
* ping locks back and forth, but that's a risk we're willing to take to avoid
* the lock inversion simply.
*/
int ocfs2_meta_lock_with_page(struct inode *inode,
int ocfs2_inode_lock_with_page(struct inode *inode,
struct buffer_head **ret_bh,
int ex,
struct page *page)
{
int ret;
ret = ocfs2_meta_lock_full(inode, ret_bh, ex, OCFS2_LOCK_NONBLOCK);
ret = ocfs2_inode_lock_full(inode, ret_bh, ex, OCFS2_LOCK_NONBLOCK);
if (ret == -EAGAIN) {
unlock_page(page);
if (ocfs2_meta_lock(inode, ret_bh, ex) == 0)
ocfs2_meta_unlock(inode, ex);
if (ocfs2_inode_lock(inode, ret_bh, ex) == 0)
ocfs2_inode_unlock(inode, ex);
ret = AOP_TRUNCATED_PAGE;
}
return ret;
}
int ocfs2_meta_lock_atime(struct inode *inode,
int ocfs2_inode_lock_atime(struct inode *inode,
struct vfsmount *vfsmnt,
int *level)
{
int ret;
mlog_entry_void();
ret = ocfs2_meta_lock(inode, NULL, 0);
ret = ocfs2_inode_lock(inode, NULL, 0);
if (ret < 0) {
mlog_errno(ret);
return ret;
@ -1890,8 +2062,8 @@ int ocfs2_meta_lock_atime(struct inode *inode,
if (ocfs2_should_update_atime(inode, vfsmnt)) {
struct buffer_head *bh = NULL;
ocfs2_meta_unlock(inode, 0);
ret = ocfs2_meta_lock(inode, &bh, 1);
ocfs2_inode_unlock(inode, 0);
ret = ocfs2_inode_lock(inode, &bh, 1);
if (ret < 0) {
mlog_errno(ret);
return ret;
@ -1908,11 +2080,11 @@ int ocfs2_meta_lock_atime(struct inode *inode,
return ret;
}
void ocfs2_meta_unlock(struct inode *inode,
void ocfs2_inode_unlock(struct inode *inode,
int ex)
{
int level = ex ? LKM_EXMODE : LKM_PRMODE;
struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_meta_lockres;
struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_inode_lockres;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
mlog_entry_void();
@ -2320,11 +2492,11 @@ int ocfs2_dlm_init(struct ocfs2_super *osb)
goto bail;
}
/* launch vote thread */
osb->vote_task = kthread_run(ocfs2_vote_thread, osb, "ocfs2vote");
if (IS_ERR(osb->vote_task)) {
status = PTR_ERR(osb->vote_task);
osb->vote_task = NULL;
/* launch downconvert thread */
osb->dc_task = kthread_run(ocfs2_downconvert_thread, osb, "ocfs2dc");
if (IS_ERR(osb->dc_task)) {
status = PTR_ERR(osb->dc_task);
osb->dc_task = NULL;
mlog_errno(status);
goto bail;
}
@ -2353,8 +2525,8 @@ local:
bail:
if (status < 0) {
ocfs2_dlm_shutdown_debug(osb);
if (osb->vote_task)
kthread_stop(osb->vote_task);
if (osb->dc_task)
kthread_stop(osb->dc_task);
}
mlog_exit(status);
@ -2369,9 +2541,9 @@ void ocfs2_dlm_shutdown(struct ocfs2_super *osb)
ocfs2_drop_osb_locks(osb);
if (osb->vote_task) {
kthread_stop(osb->vote_task);
osb->vote_task = NULL;
if (osb->dc_task) {
kthread_stop(osb->dc_task);
osb->dc_task = NULL;
}
ocfs2_lock_res_free(&osb->osb_super_lockres);
@ -2527,7 +2699,7 @@ out:
/* Mark the lockres as being dropped. It will no longer be
* queued if blocking, but we still may have to wait on it
* being dequeued from the vote thread before we can consider
* being dequeued from the downconvert thread before we can consider
* it safe to drop.
*
* You can *not* attempt to call cluster_lock on this lockres anymore. */
@ -2590,14 +2762,7 @@ int ocfs2_drop_inode_locks(struct inode *inode)
status = err;
err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
&OCFS2_I(inode)->ip_data_lockres);
if (err < 0)
mlog_errno(err);
if (err < 0 && !status)
status = err;
err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
&OCFS2_I(inode)->ip_meta_lockres);
&OCFS2_I(inode)->ip_inode_lockres);
if (err < 0)
mlog_errno(err);
if (err < 0 && !status)
@ -2850,6 +3015,9 @@ static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
inode = ocfs2_lock_res_inode(lockres);
mapping = inode->i_mapping;
if (S_ISREG(inode->i_mode))
goto out;
/*
* We need this before the filemap_fdatawrite() so that it can
* transfer the dirty bit from the PTE to the
@ -2875,6 +3043,7 @@ static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
filemap_fdatawait(mapping);
}
out:
return UNBLOCK_CONTINUE;
}
@ -2903,7 +3072,7 @@ static void ocfs2_set_meta_lvb(struct ocfs2_lock_res *lockres)
/*
* Does the final reference drop on our dentry lock. Right now this
* happens in the vote thread, but we could choose to simplify the
* happens in the downconvert thread, but we could choose to simplify the
* dlmglue API and push these off to the ocfs2_wq in the future.
*/
static void ocfs2_dentry_post_unlock(struct ocfs2_super *osb,
@ -3042,7 +3211,7 @@ void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
mlog(0, "lockres %s blocked.\n", lockres->l_name);
/* Detect whether a lock has been marked as going away while
* the vote thread was processing other things. A lock can
* the downconvert thread was processing other things. A lock can
* still be marked with OCFS2_LOCK_FREEING after this check,
* but short circuiting here will still save us some
* performance. */
@ -3091,13 +3260,104 @@ static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb,
lockres_or_flags(lockres, OCFS2_LOCK_QUEUED);
spin_lock(&osb->vote_task_lock);
spin_lock(&osb->dc_task_lock);
if (list_empty(&lockres->l_blocked_list)) {
list_add_tail(&lockres->l_blocked_list,
&osb->blocked_lock_list);
osb->blocked_lock_count++;
}
spin_unlock(&osb->vote_task_lock);
spin_unlock(&osb->dc_task_lock);
mlog_exit_void();
}
static void ocfs2_downconvert_thread_do_work(struct ocfs2_super *osb)
{
unsigned long processed;
struct ocfs2_lock_res *lockres;
mlog_entry_void();
spin_lock(&osb->dc_task_lock);
/* grab this early so we know to try again if a state change and
* wake happens part-way through our work */
osb->dc_work_sequence = osb->dc_wake_sequence;
processed = osb->blocked_lock_count;
while (processed) {
BUG_ON(list_empty(&osb->blocked_lock_list));
lockres = list_entry(osb->blocked_lock_list.next,
struct ocfs2_lock_res, l_blocked_list);
list_del_init(&lockres->l_blocked_list);
osb->blocked_lock_count--;
spin_unlock(&osb->dc_task_lock);
BUG_ON(!processed);
processed--;
ocfs2_process_blocked_lock(osb, lockres);
spin_lock(&osb->dc_task_lock);
}
spin_unlock(&osb->dc_task_lock);
mlog_exit_void();
}
static int ocfs2_downconvert_thread_lists_empty(struct ocfs2_super *osb)
{
int empty = 0;
spin_lock(&osb->dc_task_lock);
if (list_empty(&osb->blocked_lock_list))
empty = 1;
spin_unlock(&osb->dc_task_lock);
return empty;
}
static int ocfs2_downconvert_thread_should_wake(struct ocfs2_super *osb)
{
int should_wake = 0;
spin_lock(&osb->dc_task_lock);
if (osb->dc_work_sequence != osb->dc_wake_sequence)
should_wake = 1;
spin_unlock(&osb->dc_task_lock);
return should_wake;
}
int ocfs2_downconvert_thread(void *arg)
{
int status = 0;
struct ocfs2_super *osb = arg;
/* only quit once we've been asked to stop and there is no more
* work available */
while (!(kthread_should_stop() &&
ocfs2_downconvert_thread_lists_empty(osb))) {
wait_event_interruptible(osb->dc_event,
ocfs2_downconvert_thread_should_wake(osb) ||
kthread_should_stop());
mlog(0, "downconvert_thread: awoken\n");
ocfs2_downconvert_thread_do_work(osb);
}
osb->dc_task = NULL;
return status;
}
void ocfs2_wake_downconvert_thread(struct ocfs2_super *osb)
{
spin_lock(&osb->dc_task_lock);
/* make sure the voting thread gets a swipe at whatever changes
* the caller may have made to the voting state */
osb->dc_wake_sequence++;
spin_unlock(&osb->dc_task_lock);
wake_up(&osb->dc_event);
}

View file

@ -49,12 +49,12 @@ struct ocfs2_meta_lvb {
__be32 lvb_reserved2;
};
/* ocfs2_meta_lock_full() and ocfs2_data_lock_full() 'arg_flags' flags */
/* ocfs2_inode_lock_full() 'arg_flags' flags */
/* don't wait on recovery. */
#define OCFS2_META_LOCK_RECOVERY (0x01)
/* Instruct the dlm not to queue ourselves on the other node. */
#define OCFS2_META_LOCK_NOQUEUE (0x02)
/* don't block waiting for the vote thread, instead return -EAGAIN */
/* don't block waiting for the downconvert thread, instead return -EAGAIN */
#define OCFS2_LOCK_NONBLOCK (0x04)
int ocfs2_dlm_init(struct ocfs2_super *osb);
@ -66,38 +66,32 @@ void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
struct inode *inode);
void ocfs2_dentry_lock_res_init(struct ocfs2_dentry_lock *dl,
u64 parent, struct inode *inode);
struct ocfs2_file_private;
void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres,
struct ocfs2_file_private *fp);
void ocfs2_lock_res_free(struct ocfs2_lock_res *res);
int ocfs2_create_new_inode_locks(struct inode *inode);
int ocfs2_drop_inode_locks(struct inode *inode);
int ocfs2_data_lock_full(struct inode *inode,
int write,
int arg_flags);
#define ocfs2_data_lock(inode, write) ocfs2_data_lock_full(inode, write, 0)
int ocfs2_data_lock_with_page(struct inode *inode,
int write,
struct page *page);
void ocfs2_data_unlock(struct inode *inode,
int write);
int ocfs2_rw_lock(struct inode *inode, int write);
void ocfs2_rw_unlock(struct inode *inode, int write);
int ocfs2_open_lock(struct inode *inode);
int ocfs2_try_open_lock(struct inode *inode, int write);
void ocfs2_open_unlock(struct inode *inode);
int ocfs2_meta_lock_atime(struct inode *inode,
int ocfs2_inode_lock_atime(struct inode *inode,
struct vfsmount *vfsmnt,
int *level);
int ocfs2_meta_lock_full(struct inode *inode,
int ocfs2_inode_lock_full(struct inode *inode,
struct buffer_head **ret_bh,
int ex,
int arg_flags);
int ocfs2_meta_lock_with_page(struct inode *inode,
int ocfs2_inode_lock_with_page(struct inode *inode,
struct buffer_head **ret_bh,
int ex,
struct page *page);
/* 99% of the time we don't want to supply any additional flags --
* those are for very specific cases only. */
#define ocfs2_meta_lock(i, b, e) ocfs2_meta_lock_full(i, b, e, 0)
void ocfs2_meta_unlock(struct inode *inode,
#define ocfs2_inode_lock(i, b, e) ocfs2_inode_lock_full(i, b, e, 0)
void ocfs2_inode_unlock(struct inode *inode,
int ex);
int ocfs2_super_lock(struct ocfs2_super *osb,
int ex);
@ -107,14 +101,17 @@ int ocfs2_rename_lock(struct ocfs2_super *osb);
void ocfs2_rename_unlock(struct ocfs2_super *osb);
int ocfs2_dentry_lock(struct dentry *dentry, int ex);
void ocfs2_dentry_unlock(struct dentry *dentry, int ex);
int ocfs2_file_lock(struct file *file, int ex, int trylock);
void ocfs2_file_unlock(struct file *file);
void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres);
void ocfs2_simple_drop_lockres(struct ocfs2_super *osb,
struct ocfs2_lock_res *lockres);
/* for the vote thread */
/* for the downconvert thread */
void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
struct ocfs2_lock_res *lockres);
void ocfs2_wake_downconvert_thread(struct ocfs2_super *osb);
struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void);
void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug);

View file

@ -37,11 +37,6 @@ static inline void le64_add_cpu(__le64 *var, u64 val)
*var = cpu_to_le64(le64_to_cpu(*var) + val);
}
static inline void le32_and_cpu(__le32 *var, u32 val)
{
*var = cpu_to_le32(le32_to_cpu(*var) & val);
}
static inline void be32_add_cpu(__be32 *var, u32 val)
{
*var = cpu_to_be32(be32_to_cpu(*var) + val);

View file

@ -58,7 +58,7 @@ static struct dentry *ocfs2_get_dentry(struct super_block *sb,
return ERR_PTR(-ESTALE);
}
inode = ocfs2_iget(OCFS2_SB(sb), handle->ih_blkno, 0);
inode = ocfs2_iget(OCFS2_SB(sb), handle->ih_blkno, 0, 0);
if (IS_ERR(inode))
return (void *)inode;
@ -95,7 +95,7 @@ static struct dentry *ocfs2_get_parent(struct dentry *child)
mlog(0, "find parent of directory %llu\n",
(unsigned long long)OCFS2_I(dir)->ip_blkno);
status = ocfs2_meta_lock(dir, NULL, 0);
status = ocfs2_inode_lock(dir, NULL, 0);
if (status < 0) {
if (status != -ENOENT)
mlog_errno(status);
@ -109,7 +109,7 @@ static struct dentry *ocfs2_get_parent(struct dentry *child)
goto bail_unlock;
}
inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0);
inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0, 0);
if (IS_ERR(inode)) {
mlog(ML_ERROR, "Unable to create inode %llu\n",
(unsigned long long)blkno);
@ -126,7 +126,7 @@ static struct dentry *ocfs2_get_parent(struct dentry *child)
parent->d_op = &ocfs2_dentry_ops;
bail_unlock:
ocfs2_meta_unlock(dir, 0);
ocfs2_inode_unlock(dir, 0);
bail:
mlog_exit_ptr(parent);

View file

@ -51,6 +51,7 @@
#include "inode.h"
#include "ioctl.h"
#include "journal.h"
#include "locks.h"
#include "mmap.h"
#include "suballoc.h"
#include "super.h"
@ -63,6 +64,35 @@ static int ocfs2_sync_inode(struct inode *inode)
return sync_mapping_buffers(inode->i_mapping);
}
static int ocfs2_init_file_private(struct inode *inode, struct file *file)
{
struct ocfs2_file_private *fp;
fp = kzalloc(sizeof(struct ocfs2_file_private), GFP_KERNEL);
if (!fp)
return -ENOMEM;
fp->fp_file = file;
mutex_init(&fp->fp_mutex);
ocfs2_file_lock_res_init(&fp->fp_flock, fp);
file->private_data = fp;
return 0;
}
static void ocfs2_free_file_private(struct inode *inode, struct file *file)
{
struct ocfs2_file_private *fp = file->private_data;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
if (fp) {
ocfs2_simple_drop_lockres(osb, &fp->fp_flock);
ocfs2_lock_res_free(&fp->fp_flock);
kfree(fp);
file->private_data = NULL;
}
}
static int ocfs2_file_open(struct inode *inode, struct file *file)
{
int status;
@ -89,7 +119,18 @@ static int ocfs2_file_open(struct inode *inode, struct file *file)
oi->ip_open_count++;
spin_unlock(&oi->ip_lock);
status = 0;
status = ocfs2_init_file_private(inode, file);
if (status) {
/*
* We want to set open count back if we're failing the
* open.
*/
spin_lock(&oi->ip_lock);
oi->ip_open_count--;
spin_unlock(&oi->ip_lock);
}
leave:
mlog_exit(status);
return status;
@ -108,11 +149,24 @@ static int ocfs2_file_release(struct inode *inode, struct file *file)
oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT;
spin_unlock(&oi->ip_lock);
ocfs2_free_file_private(inode, file);
mlog_exit(0);
return 0;
}
static int ocfs2_dir_open(struct inode *inode, struct file *file)
{
return ocfs2_init_file_private(inode, file);
}
static int ocfs2_dir_release(struct inode *inode, struct file *file)
{
ocfs2_free_file_private(inode, file);
return 0;
}
static int ocfs2_sync_file(struct file *file,
struct dentry *dentry,
int datasync)
@ -382,18 +436,13 @@ static int ocfs2_truncate_file(struct inode *inode,
down_write(&OCFS2_I(inode)->ip_alloc_sem);
/* This forces other nodes to sync and drop their pages. Do
* this even if we have a truncate without allocation change -
* ocfs2 cluster sizes can be much greater than page size, so
* we have to truncate them anyway. */
status = ocfs2_data_lock(inode, 1);
if (status < 0) {
up_write(&OCFS2_I(inode)->ip_alloc_sem);
mlog_errno(status);
goto bail;
}
/*
* The inode lock forced other nodes to sync and drop their
* pages, which (correctly) happens even if we have a truncate
* without allocation change - ocfs2 cluster sizes can be much
* greater than page size, so we have to truncate them
* anyway.
*/
unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1);
truncate_inode_pages(inode->i_mapping, new_i_size);
@ -403,7 +452,7 @@ static int ocfs2_truncate_file(struct inode *inode,
if (status)
mlog_errno(status);
goto bail_unlock_data;
goto bail_unlock_sem;
}
/* alright, we're going to need to do a full blown alloc size
@ -413,25 +462,23 @@ static int ocfs2_truncate_file(struct inode *inode,
status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size);
if (status < 0) {
mlog_errno(status);
goto bail_unlock_data;
goto bail_unlock_sem;
}
status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc);
if (status < 0) {
mlog_errno(status);
goto bail_unlock_data;
goto bail_unlock_sem;
}
status = ocfs2_commit_truncate(osb, inode, di_bh, tc);
if (status < 0) {
mlog_errno(status);
goto bail_unlock_data;
goto bail_unlock_sem;
}
/* TODO: orphan dir cleanup here. */
bail_unlock_data:
ocfs2_data_unlock(inode, 1);
bail_unlock_sem:
up_write(&OCFS2_I(inode)->ip_alloc_sem);
bail:
@ -579,7 +626,7 @@ int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, "
"clusters_to_add = %u, extents_to_split = %u\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno, i_size_read(inode),
(unsigned long long)OCFS2_I(inode)->ip_blkno, (long long)i_size_read(inode),
le32_to_cpu(di->i_clusters), clusters_to_add, extents_to_split);
num_free_extents = ocfs2_num_free_extents(osb, inode, di);
@ -760,7 +807,7 @@ restarted_transaction:
le32_to_cpu(fe->i_clusters),
(unsigned long long)le64_to_cpu(fe->i_size));
mlog(0, "inode: ip_clusters=%u, i_size=%lld\n",
OCFS2_I(inode)->ip_clusters, i_size_read(inode));
OCFS2_I(inode)->ip_clusters, (long long)i_size_read(inode));
leave:
if (handle) {
@ -917,7 +964,7 @@ static int ocfs2_extend_file(struct inode *inode,
struct buffer_head *di_bh,
u64 new_i_size)
{
int ret = 0, data_locked = 0;
int ret = 0;
struct ocfs2_inode_info *oi = OCFS2_I(inode);
BUG_ON(!di_bh);
@ -943,20 +990,6 @@ static int ocfs2_extend_file(struct inode *inode,
&& ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
goto out_update_size;
/*
* protect the pages that ocfs2_zero_extend is going to be
* pulling into the page cache.. we do this before the
* metadata extend so that we don't get into the situation
* where we've extended the metadata but can't get the data
* lock to zero.
*/
ret = ocfs2_data_lock(inode, 1);
if (ret < 0) {
mlog_errno(ret);
goto out;
}
data_locked = 1;
/*
* The alloc sem blocks people in read/write from reading our
* allocation until we're done changing it. We depend on
@ -980,7 +1013,7 @@ static int ocfs2_extend_file(struct inode *inode,
up_write(&oi->ip_alloc_sem);
mlog_errno(ret);
goto out_unlock;
goto out;
}
}
@ -991,7 +1024,7 @@ static int ocfs2_extend_file(struct inode *inode,
if (ret < 0) {
mlog_errno(ret);
goto out_unlock;
goto out;
}
out_update_size:
@ -999,10 +1032,6 @@ out_update_size:
if (ret < 0)
mlog_errno(ret);
out_unlock:
if (data_locked)
ocfs2_data_unlock(inode, 1);
out:
return ret;
}
@ -1050,7 +1079,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
}
}
status = ocfs2_meta_lock(inode, &bh, 1);
status = ocfs2_inode_lock(inode, &bh, 1);
if (status < 0) {
if (status != -ENOENT)
mlog_errno(status);
@ -1102,7 +1131,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
bail_commit:
ocfs2_commit_trans(osb, handle);
bail_unlock:
ocfs2_meta_unlock(inode, 1);
ocfs2_inode_unlock(inode, 1);
bail_unlock_rw:
if (size_change)
ocfs2_rw_unlock(inode, 1);
@ -1149,7 +1178,7 @@ int ocfs2_permission(struct inode *inode, int mask, struct nameidata *nd)
mlog_entry_void();
ret = ocfs2_meta_lock(inode, NULL, 0);
ret = ocfs2_inode_lock(inode, NULL, 0);
if (ret) {
if (ret != -ENOENT)
mlog_errno(ret);
@ -1158,7 +1187,7 @@ int ocfs2_permission(struct inode *inode, int mask, struct nameidata *nd)
ret = generic_permission(inode, mask, NULL);
ocfs2_meta_unlock(inode, 0);
ocfs2_inode_unlock(inode, 0);
out:
mlog_exit(ret);
return ret;
@ -1630,7 +1659,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
goto out;
}
ret = ocfs2_meta_lock(inode, &di_bh, 1);
ret = ocfs2_inode_lock(inode, &di_bh, 1);
if (ret) {
mlog_errno(ret);
goto out_rw_unlock;
@ -1638,7 +1667,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
ret = -EPERM;
goto out_meta_unlock;
goto out_inode_unlock;
}
switch (sr->l_whence) {
@ -1652,7 +1681,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
break;
default:
ret = -EINVAL;
goto out_meta_unlock;
goto out_inode_unlock;
}
sr->l_whence = 0;
@ -1663,14 +1692,14 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
|| (sr->l_start + llen) < 0
|| (sr->l_start + llen) > max_off) {
ret = -EINVAL;
goto out_meta_unlock;
goto out_inode_unlock;
}
size = sr->l_start + sr->l_len;
if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) {
if (sr->l_len <= 0) {
ret = -EINVAL;
goto out_meta_unlock;
goto out_inode_unlock;
}
}
@ -1678,7 +1707,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
ret = __ocfs2_write_remove_suid(inode, di_bh);
if (ret) {
mlog_errno(ret);
goto out_meta_unlock;
goto out_inode_unlock;
}
}
@ -1704,7 +1733,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
up_write(&OCFS2_I(inode)->ip_alloc_sem);
if (ret) {
mlog_errno(ret);
goto out_meta_unlock;
goto out_inode_unlock;
}
/*
@ -1714,7 +1743,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
mlog_errno(ret);
goto out_meta_unlock;
goto out_inode_unlock;
}
if (change_size && i_size_read(inode) < size)
@ -1727,9 +1756,9 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
ocfs2_commit_trans(osb, handle);
out_meta_unlock:
out_inode_unlock:
brelse(di_bh);
ocfs2_meta_unlock(inode, 1);
ocfs2_inode_unlock(inode, 1);
out_rw_unlock:
ocfs2_rw_unlock(inode, 1);
@ -1799,7 +1828,7 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
* if we need to make modifications here.
*/
for(;;) {
ret = ocfs2_meta_lock(inode, NULL, meta_level);
ret = ocfs2_inode_lock(inode, NULL, meta_level);
if (ret < 0) {
meta_level = -1;
mlog_errno(ret);
@ -1817,7 +1846,7 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
* set inode->i_size at the end of a write. */
if (should_remove_suid(dentry)) {
if (meta_level == 0) {
ocfs2_meta_unlock(inode, meta_level);
ocfs2_inode_unlock(inode, meta_level);
meta_level = 1;
continue;
}
@ -1886,7 +1915,7 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
*ppos = saved_pos;
out_unlock:
ocfs2_meta_unlock(inode, meta_level);
ocfs2_inode_unlock(inode, meta_level);
out:
return ret;
@ -2099,12 +2128,12 @@ static ssize_t ocfs2_file_splice_read(struct file *in,
/*
* See the comment in ocfs2_file_aio_read()
*/
ret = ocfs2_meta_lock(inode, NULL, 0);
ret = ocfs2_inode_lock(inode, NULL, 0);
if (ret < 0) {
mlog_errno(ret);
goto bail;
}
ocfs2_meta_unlock(inode, 0);
ocfs2_inode_unlock(inode, 0);
ret = generic_file_splice_read(in, ppos, pipe, len, flags);
@ -2160,12 +2189,12 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
* like i_size. This allows the checks down below
* generic_file_aio_read() a chance of actually working.
*/
ret = ocfs2_meta_lock_atime(inode, filp->f_vfsmnt, &lock_level);
ret = ocfs2_inode_lock_atime(inode, filp->f_vfsmnt, &lock_level);
if (ret < 0) {
mlog_errno(ret);
goto bail;
}
ocfs2_meta_unlock(inode, lock_level);
ocfs2_inode_unlock(inode, lock_level);
ret = generic_file_aio_read(iocb, iov, nr_segs, iocb->ki_pos);
if (ret == -EINVAL)
@ -2204,6 +2233,7 @@ const struct inode_operations ocfs2_special_file_iops = {
};
const struct file_operations ocfs2_fops = {
.llseek = generic_file_llseek,
.read = do_sync_read,
.write = do_sync_write,
.mmap = ocfs2_mmap,
@ -2216,16 +2246,21 @@ const struct file_operations ocfs2_fops = {
#ifdef CONFIG_COMPAT
.compat_ioctl = ocfs2_compat_ioctl,
#endif
.flock = ocfs2_flock,
.splice_read = ocfs2_file_splice_read,
.splice_write = ocfs2_file_splice_write,
};
const struct file_operations ocfs2_dops = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
.readdir = ocfs2_readdir,
.fsync = ocfs2_sync_file,
.release = ocfs2_dir_release,
.open = ocfs2_dir_open,
.ioctl = ocfs2_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = ocfs2_compat_ioctl,
#endif
.flock = ocfs2_flock,
};

View file

@ -32,6 +32,12 @@ extern const struct inode_operations ocfs2_file_iops;
extern const struct inode_operations ocfs2_special_file_iops;
struct ocfs2_alloc_context;
struct ocfs2_file_private {
struct file *fp_file;
struct mutex fp_mutex;
struct ocfs2_lock_res fp_flock;
};
enum ocfs2_alloc_restarted {
RESTART_NONE = 0,
RESTART_TRANS,

View file

@ -30,9 +30,6 @@
#include <linux/highmem.h>
#include <linux/kmod.h>
#include <cluster/heartbeat.h>
#include <cluster/nodemanager.h>
#include <dlm/dlmapi.h>
#define MLOG_MASK_PREFIX ML_SUPER
@ -44,13 +41,9 @@
#include "heartbeat.h"
#include "inode.h"
#include "journal.h"
#include "vote.h"
#include "buffer_head_io.h"
#define OCFS2_HB_NODE_DOWN_PRI (0x0000002)
#define OCFS2_HB_NODE_UP_PRI OCFS2_HB_NODE_DOWN_PRI
static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map,
int bit);
static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map,
@ -64,9 +57,7 @@ static void __ocfs2_node_map_set(struct ocfs2_node_map *target,
void ocfs2_init_node_maps(struct ocfs2_super *osb)
{
spin_lock_init(&osb->node_map_lock);
ocfs2_node_map_init(&osb->mounted_map);
ocfs2_node_map_init(&osb->recovery_map);
ocfs2_node_map_init(&osb->umount_map);
ocfs2_node_map_init(&osb->osb_recovering_orphan_dirs);
}
@ -87,24 +78,7 @@ static void ocfs2_do_node_down(int node_num,
return;
}
if (ocfs2_node_map_test_bit(osb, &osb->umount_map, node_num)) {
/* If a node is in the umount map, then we've been
* expecting him to go down and we know ahead of time
* that recovery is not necessary. */
ocfs2_node_map_clear_bit(osb, &osb->umount_map, node_num);
return;
}
ocfs2_recovery_thread(osb, node_num);
ocfs2_remove_node_from_vote_queues(osb, node_num);
}
static void ocfs2_hb_node_down_cb(struct o2nm_node *node,
int node_num,
void *data)
{
ocfs2_do_node_down(node_num, (struct ocfs2_super *) data);
}
/* Called from the dlm when it's about to evict a node. We may also
@ -121,27 +95,8 @@ static void ocfs2_dlm_eviction_cb(int node_num,
ocfs2_do_node_down(node_num, osb);
}
static void ocfs2_hb_node_up_cb(struct o2nm_node *node,
int node_num,
void *data)
{
struct ocfs2_super *osb = data;
BUG_ON(osb->node_num == node_num);
mlog(0, "node up event for %d\n", node_num);
ocfs2_node_map_clear_bit(osb, &osb->umount_map, node_num);
}
void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb)
{
o2hb_setup_callback(&osb->osb_hb_down, O2HB_NODE_DOWN_CB,
ocfs2_hb_node_down_cb, osb,
OCFS2_HB_NODE_DOWN_PRI);
o2hb_setup_callback(&osb->osb_hb_up, O2HB_NODE_UP_CB,
ocfs2_hb_node_up_cb, osb, OCFS2_HB_NODE_UP_PRI);
/* Not exactly a heartbeat callback, but leads to essentially
* the same path so we set it up here. */
dlm_setup_eviction_cb(&osb->osb_eviction_cb,
@ -149,39 +104,6 @@ void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb)
osb);
}
/* Most functions here are just stubs for now... */
int ocfs2_register_hb_callbacks(struct ocfs2_super *osb)
{
int status;
if (ocfs2_mount_local(osb))
return 0;
status = o2hb_register_callback(osb->uuid_str, &osb->osb_hb_down);
if (status < 0) {
mlog_errno(status);
goto bail;
}
status = o2hb_register_callback(osb->uuid_str, &osb->osb_hb_up);
if (status < 0) {
mlog_errno(status);
o2hb_unregister_callback(osb->uuid_str, &osb->osb_hb_down);
}
bail:
return status;
}
void ocfs2_clear_hb_callbacks(struct ocfs2_super *osb)
{
if (ocfs2_mount_local(osb))
return;
o2hb_unregister_callback(osb->uuid_str, &osb->osb_hb_down);
o2hb_unregister_callback(osb->uuid_str, &osb->osb_hb_up);
}
void ocfs2_stop_heartbeat(struct ocfs2_super *osb)
{
int ret;
@ -341,8 +263,6 @@ int ocfs2_recovery_map_set(struct ocfs2_super *osb,
spin_lock(&osb->node_map_lock);
__ocfs2_node_map_clear_bit(&osb->mounted_map, num);
if (!test_bit(num, osb->recovery_map.map)) {
__ocfs2_node_map_set_bit(&osb->recovery_map, num);
set = 1;

View file

@ -29,8 +29,6 @@
void ocfs2_init_node_maps(struct ocfs2_super *osb);
void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb);
int ocfs2_register_hb_callbacks(struct ocfs2_super *osb);
void ocfs2_clear_hb_callbacks(struct ocfs2_super *osb);
void ocfs2_stop_heartbeat(struct ocfs2_super *osb);
/* node map functions - used to keep track of mounted and in-recovery

View file

@ -49,7 +49,6 @@
#include "symlink.h"
#include "sysfile.h"
#include "uptodate.h"
#include "vote.h"
#include "buffer_head_io.h"
@ -58,8 +57,11 @@ struct ocfs2_find_inode_args
u64 fi_blkno;
unsigned long fi_ino;
unsigned int fi_flags;
unsigned int fi_sysfile_type;
};
static struct lock_class_key ocfs2_sysfile_lock_key[NUM_SYSTEM_INODES];
static int ocfs2_read_locked_inode(struct inode *inode,
struct ocfs2_find_inode_args *args);
static int ocfs2_init_locked_inode(struct inode *inode, void *opaque);
@ -107,7 +109,8 @@ void ocfs2_get_inode_flags(struct ocfs2_inode_info *oi)
oi->ip_attr |= OCFS2_DIRSYNC_FL;
}
struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, int flags)
struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
int sysfile_type)
{
struct inode *inode = NULL;
struct super_block *sb = osb->sb;
@ -127,6 +130,7 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, int flags)
args.fi_blkno = blkno;
args.fi_flags = flags;
args.fi_ino = ino_from_blkno(sb, blkno);
args.fi_sysfile_type = sysfile_type;
inode = iget5_locked(sb, args.fi_ino, ocfs2_find_actor,
ocfs2_init_locked_inode, &args);
@ -201,6 +205,9 @@ static int ocfs2_init_locked_inode(struct inode *inode, void *opaque)
inode->i_ino = args->fi_ino;
OCFS2_I(inode)->ip_blkno = args->fi_blkno;
if (args->fi_sysfile_type != 0)
lockdep_set_class(&inode->i_mutex,
&ocfs2_sysfile_lock_key[args->fi_sysfile_type]);
mlog_exit(0);
return 0;
@ -322,7 +329,7 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
*/
BUG_ON(le32_to_cpu(fe->i_flags) & OCFS2_SYSTEM_FL);
ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_meta_lockres,
ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_inode_lockres,
OCFS2_LOCK_TYPE_META, 0, inode);
ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_open_lockres,
@ -333,10 +340,6 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
OCFS2_LOCK_TYPE_RW, inode->i_generation,
inode);
ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_data_lockres,
OCFS2_LOCK_TYPE_DATA, inode->i_generation,
inode);
ocfs2_set_inode_flags(inode);
status = 0;
@ -414,7 +417,7 @@ static int ocfs2_read_locked_inode(struct inode *inode,
if (args->fi_flags & OCFS2_FI_FLAG_SYSFILE)
generation = osb->fs_generation;
ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_meta_lockres,
ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_inode_lockres,
OCFS2_LOCK_TYPE_META,
generation, inode);
@ -429,7 +432,7 @@ static int ocfs2_read_locked_inode(struct inode *inode,
mlog_errno(status);
return status;
}
status = ocfs2_meta_lock(inode, NULL, 0);
status = ocfs2_inode_lock(inode, NULL, 0);
if (status) {
make_bad_inode(inode);
mlog_errno(status);
@ -484,7 +487,7 @@ static int ocfs2_read_locked_inode(struct inode *inode,
bail:
if (can_lock)
ocfs2_meta_unlock(inode, 0);
ocfs2_inode_unlock(inode, 0);
if (status < 0)
make_bad_inode(inode);
@ -586,7 +589,7 @@ static int ocfs2_remove_inode(struct inode *inode,
}
mutex_lock(&inode_alloc_inode->i_mutex);
status = ocfs2_meta_lock(inode_alloc_inode, &inode_alloc_bh, 1);
status = ocfs2_inode_lock(inode_alloc_inode, &inode_alloc_bh, 1);
if (status < 0) {
mutex_unlock(&inode_alloc_inode->i_mutex);
@ -617,7 +620,7 @@ static int ocfs2_remove_inode(struct inode *inode,
}
di->i_dtime = cpu_to_le64(CURRENT_TIME.tv_sec);
le32_and_cpu(&di->i_flags, ~(OCFS2_VALID_FL | OCFS2_ORPHANED_FL));
di->i_flags &= cpu_to_le32(~(OCFS2_VALID_FL | OCFS2_ORPHANED_FL));
status = ocfs2_journal_dirty(handle, di_bh);
if (status < 0) {
@ -635,7 +638,7 @@ static int ocfs2_remove_inode(struct inode *inode,
bail_commit:
ocfs2_commit_trans(osb, handle);
bail_unlock:
ocfs2_meta_unlock(inode_alloc_inode, 1);
ocfs2_inode_unlock(inode_alloc_inode, 1);
mutex_unlock(&inode_alloc_inode->i_mutex);
brelse(inode_alloc_bh);
bail:
@ -709,7 +712,7 @@ static int ocfs2_wipe_inode(struct inode *inode,
* delete_inode operation. We do this now to avoid races with
* recovery completion on other nodes. */
mutex_lock(&orphan_dir_inode->i_mutex);
status = ocfs2_meta_lock(orphan_dir_inode, &orphan_dir_bh, 1);
status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1);
if (status < 0) {
mutex_unlock(&orphan_dir_inode->i_mutex);
@ -718,8 +721,8 @@ static int ocfs2_wipe_inode(struct inode *inode,
}
/* we do this while holding the orphan dir lock because we
* don't want recovery being run from another node to vote for
* an inode delete on us -- this will result in two nodes
* don't want recovery being run from another node to try an
* inode delete underneath us -- this will result in two nodes
* truncating the same file! */
status = ocfs2_truncate_for_delete(osb, inode, di_bh);
if (status < 0) {
@ -733,7 +736,7 @@ static int ocfs2_wipe_inode(struct inode *inode,
mlog_errno(status);
bail_unlock_dir:
ocfs2_meta_unlock(orphan_dir_inode, 1);
ocfs2_inode_unlock(orphan_dir_inode, 1);
mutex_unlock(&orphan_dir_inode->i_mutex);
brelse(orphan_dir_bh);
bail:
@ -744,7 +747,7 @@ bail:
}
/* There is a series of simple checks that should be done before a
* vote is even considered. Encapsulate those in this function. */
* trylock is even considered. Encapsulate those in this function. */
static int ocfs2_inode_is_valid_to_delete(struct inode *inode)
{
int ret = 0;
@ -758,14 +761,14 @@ static int ocfs2_inode_is_valid_to_delete(struct inode *inode)
goto bail;
}
/* If we're coming from process_vote we can't go into our own
/* If we're coming from downconvert_thread we can't go into our own
* voting [hello, deadlock city!], so unforuntately we just
* have to skip deleting this guy. That's OK though because
* the node who's doing the actual deleting should handle it
* anyway. */
if (current == osb->vote_task) {
if (current == osb->dc_task) {
mlog(0, "Skipping delete of %lu because we're currently "
"in process_vote\n", inode->i_ino);
"in downconvert\n", inode->i_ino);
goto bail;
}
@ -779,10 +782,9 @@ static int ocfs2_inode_is_valid_to_delete(struct inode *inode)
goto bail_unlock;
}
/* If we have voted "yes" on the wipe of this inode for
* another node, it will be marked here so we can safely skip
* it. Recovery will cleanup any inodes we might inadvertantly
* skip here. */
/* If we have allowd wipe of this inode for another node, it
* will be marked here so we can safely skip it. Recovery will
* cleanup any inodes we might inadvertantly skip here. */
if (oi->ip_flags & OCFS2_INODE_SKIP_DELETE) {
mlog(0, "Skipping delete of %lu because another node "
"has done this for us.\n", inode->i_ino);
@ -929,13 +931,13 @@ void ocfs2_delete_inode(struct inode *inode)
/* Lock down the inode. This gives us an up to date view of
* it's metadata (for verification), and allows us to
* serialize delete_inode votes.
* serialize delete_inode on multiple nodes.
*
* Even though we might be doing a truncate, we don't take the
* allocation lock here as it won't be needed - nobody will
* have the file open.
*/
status = ocfs2_meta_lock(inode, &di_bh, 1);
status = ocfs2_inode_lock(inode, &di_bh, 1);
if (status < 0) {
if (status != -ENOENT)
mlog_errno(status);
@ -947,15 +949,15 @@ void ocfs2_delete_inode(struct inode *inode)
* before we go ahead and wipe the inode. */
status = ocfs2_query_inode_wipe(inode, di_bh, &wipe);
if (!wipe || status < 0) {
/* Error and inode busy vote both mean we won't be
/* Error and remote inode busy both mean we won't be
* removing the inode, so they take almost the same
* path. */
if (status < 0)
mlog_errno(status);
/* Someone in the cluster has voted to not wipe this
* inode, or it was never completely orphaned. Write
* out the pages and exit now. */
/* Someone in the cluster has disallowed a wipe of
* this inode, or it was never completely
* orphaned. Write out the pages and exit now. */
ocfs2_cleanup_delete_inode(inode, 1);
goto bail_unlock_inode;
}
@ -981,7 +983,7 @@ void ocfs2_delete_inode(struct inode *inode)
OCFS2_I(inode)->ip_flags |= OCFS2_INODE_DELETED;
bail_unlock_inode:
ocfs2_meta_unlock(inode, 1);
ocfs2_inode_unlock(inode, 1);
brelse(di_bh);
bail_unblock:
status = sigprocmask(SIG_SETMASK, &oldset, NULL);
@ -1008,15 +1010,14 @@ void ocfs2_clear_inode(struct inode *inode)
mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL,
"Inode=%lu\n", inode->i_ino);
/* For remove delete_inode vote, we hold open lock before,
* now it is time to unlock PR and EX open locks. */
/* To preven remote deletes we hold open lock before, now it
* is time to unlock PR and EX open locks. */
ocfs2_open_unlock(inode);
/* Do these before all the other work so that we don't bounce
* the vote thread while waiting to destroy the locks. */
* the downconvert thread while waiting to destroy the locks. */
ocfs2_mark_lockres_freeing(&oi->ip_rw_lockres);
ocfs2_mark_lockres_freeing(&oi->ip_meta_lockres);
ocfs2_mark_lockres_freeing(&oi->ip_data_lockres);
ocfs2_mark_lockres_freeing(&oi->ip_inode_lockres);
ocfs2_mark_lockres_freeing(&oi->ip_open_lockres);
/* We very well may get a clear_inode before all an inodes
@ -1039,8 +1040,7 @@ void ocfs2_clear_inode(struct inode *inode)
mlog_errno(status);
ocfs2_lock_res_free(&oi->ip_rw_lockres);
ocfs2_lock_res_free(&oi->ip_meta_lockres);
ocfs2_lock_res_free(&oi->ip_data_lockres);
ocfs2_lock_res_free(&oi->ip_inode_lockres);
ocfs2_lock_res_free(&oi->ip_open_lockres);
ocfs2_metadata_cache_purge(inode);
@ -1184,15 +1184,15 @@ int ocfs2_inode_revalidate(struct dentry *dentry)
}
spin_unlock(&OCFS2_I(inode)->ip_lock);
/* Let ocfs2_meta_lock do the work of updating our struct
/* Let ocfs2_inode_lock do the work of updating our struct
* inode for us. */
status = ocfs2_meta_lock(inode, NULL, 0);
status = ocfs2_inode_lock(inode, NULL, 0);
if (status < 0) {
if (status != -ENOENT)
mlog_errno(status);
goto bail;
}
ocfs2_meta_unlock(inode, 0);
ocfs2_inode_unlock(inode, 0);
bail:
mlog_exit(status);

View file

@ -34,8 +34,7 @@ struct ocfs2_inode_info
u64 ip_blkno;
struct ocfs2_lock_res ip_rw_lockres;
struct ocfs2_lock_res ip_meta_lockres;
struct ocfs2_lock_res ip_data_lockres;
struct ocfs2_lock_res ip_inode_lockres;
struct ocfs2_lock_res ip_open_lockres;
/* protects allocation changes on this inode. */
@ -121,9 +120,10 @@ void ocfs2_delete_inode(struct inode *inode);
void ocfs2_drop_inode(struct inode *inode);
/* Flags for ocfs2_iget() */
#define OCFS2_FI_FLAG_SYSFILE 0x4
#define OCFS2_FI_FLAG_ORPHAN_RECOVERY 0x8
struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, int flags);
#define OCFS2_FI_FLAG_SYSFILE 0x1
#define OCFS2_FI_FLAG_ORPHAN_RECOVERY 0x2
struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, unsigned flags,
int sysfile_type);
int ocfs2_inode_init_private(struct inode *inode);
int ocfs2_inode_revalidate(struct dentry *dentry);
int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,

View file

@ -20,6 +20,7 @@
#include "ocfs2_fs.h"
#include "ioctl.h"
#include "resize.h"
#include <linux/ext2_fs.h>
@ -27,14 +28,14 @@ static int ocfs2_get_inode_attr(struct inode *inode, unsigned *flags)
{
int status;
status = ocfs2_meta_lock(inode, NULL, 0);
status = ocfs2_inode_lock(inode, NULL, 0);
if (status < 0) {
mlog_errno(status);
return status;
}
ocfs2_get_inode_flags(OCFS2_I(inode));
*flags = OCFS2_I(inode)->ip_attr;
ocfs2_meta_unlock(inode, 0);
ocfs2_inode_unlock(inode, 0);
mlog_exit(status);
return status;
@ -52,7 +53,7 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags,
mutex_lock(&inode->i_mutex);
status = ocfs2_meta_lock(inode, &bh, 1);
status = ocfs2_inode_lock(inode, &bh, 1);
if (status < 0) {
mlog_errno(status);
goto bail;
@ -100,7 +101,7 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags,
ocfs2_commit_trans(osb, handle);
bail_unlock:
ocfs2_meta_unlock(inode, 1);
ocfs2_inode_unlock(inode, 1);
bail:
mutex_unlock(&inode->i_mutex);
@ -115,8 +116,10 @@ int ocfs2_ioctl(struct inode * inode, struct file * filp,
unsigned int cmd, unsigned long arg)
{
unsigned int flags;
int new_clusters;
int status;
struct ocfs2_space_resv sr;
struct ocfs2_new_group_input input;
switch (cmd) {
case OCFS2_IOC_GETFLAGS:
@ -140,6 +143,23 @@ int ocfs2_ioctl(struct inode * inode, struct file * filp,
return -EFAULT;
return ocfs2_change_file_space(filp, cmd, &sr);
case OCFS2_IOC_GROUP_EXTEND:
if (!capable(CAP_SYS_RESOURCE))
return -EPERM;
if (get_user(new_clusters, (int __user *)arg))
return -EFAULT;
return ocfs2_group_extend(inode, new_clusters);
case OCFS2_IOC_GROUP_ADD:
case OCFS2_IOC_GROUP_ADD64:
if (!capable(CAP_SYS_RESOURCE))
return -EPERM;
if (copy_from_user(&input, (int __user *) arg, sizeof(input)))
return -EFAULT;
return ocfs2_group_add(inode, &input);
default:
return -ENOTTY;
}
@ -162,6 +182,9 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
case OCFS2_IOC_RESVSP64:
case OCFS2_IOC_UNRESVSP:
case OCFS2_IOC_UNRESVSP64:
case OCFS2_IOC_GROUP_EXTEND:
case OCFS2_IOC_GROUP_ADD:
case OCFS2_IOC_GROUP_ADD64:
break;
default:
return -ENOIOCTLCMD;

View file

@ -44,7 +44,6 @@
#include "localalloc.h"
#include "slot_map.h"
#include "super.h"
#include "vote.h"
#include "sysfile.h"
#include "buffer_head_io.h"
@ -103,7 +102,7 @@ static int ocfs2_commit_cache(struct ocfs2_super *osb)
mlog(0, "commit_thread: flushed transaction %lu (%u handles)\n",
journal->j_trans_id, flushed);
ocfs2_kick_vote_thread(osb);
ocfs2_wake_downconvert_thread(osb);
wake_up(&journal->j_checkpointed);
finally:
mlog_exit(status);
@ -314,14 +313,18 @@ int ocfs2_journal_dirty_data(handle_t *handle,
return err;
}
#define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * 5)
#define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * JBD_DEFAULT_MAX_COMMIT_AGE)
void ocfs2_set_journal_params(struct ocfs2_super *osb)
{
journal_t *journal = osb->journal->j_journal;
unsigned long commit_interval = OCFS2_DEFAULT_COMMIT_INTERVAL;
if (osb->osb_commit_interval)
commit_interval = osb->osb_commit_interval;
spin_lock(&journal->j_state_lock);
journal->j_commit_interval = OCFS2_DEFAULT_COMMIT_INTERVAL;
journal->j_commit_interval = commit_interval;
if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER)
journal->j_flags |= JFS_BARRIER;
else
@ -337,7 +340,7 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
struct ocfs2_dinode *di = NULL;
struct buffer_head *bh = NULL;
struct ocfs2_super *osb;
int meta_lock = 0;
int inode_lock = 0;
mlog_entry_void();
@ -367,14 +370,14 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
/* Skip recovery waits here - journal inode metadata never
* changes in a live cluster so it can be considered an
* exception to the rule. */
status = ocfs2_meta_lock_full(inode, &bh, 1, OCFS2_META_LOCK_RECOVERY);
status = ocfs2_inode_lock_full(inode, &bh, 1, OCFS2_META_LOCK_RECOVERY);
if (status < 0) {
if (status != -ERESTARTSYS)
mlog(ML_ERROR, "Could not get lock on journal!\n");
goto done;
}
meta_lock = 1;
inode_lock = 1;
di = (struct ocfs2_dinode *)bh->b_data;
if (inode->i_size < OCFS2_MIN_JOURNAL_SIZE) {
@ -414,8 +417,8 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
status = 0;
done:
if (status < 0) {
if (meta_lock)
ocfs2_meta_unlock(inode, 1);
if (inode_lock)
ocfs2_inode_unlock(inode, 1);
if (bh != NULL)
brelse(bh);
if (inode) {
@ -544,7 +547,7 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
OCFS2_I(inode)->ip_open_count--;
/* unlock our journal */
ocfs2_meta_unlock(inode, 1);
ocfs2_inode_unlock(inode, 1);
brelse(journal->j_bh);
journal->j_bh = NULL;
@ -883,8 +886,8 @@ restart:
ocfs2_super_unlock(osb, 1);
/* We always run recovery on our own orphan dir - the dead
* node(s) may have voted "no" on an inode delete earlier. A
* revote is therefore required. */
* node(s) may have disallowd a previos inode delete. Re-processing
* is therefore required. */
ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL,
NULL);
@ -973,9 +976,9 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
}
SET_INODE_JOURNAL(inode);
status = ocfs2_meta_lock_full(inode, &bh, 1, OCFS2_META_LOCK_RECOVERY);
status = ocfs2_inode_lock_full(inode, &bh, 1, OCFS2_META_LOCK_RECOVERY);
if (status < 0) {
mlog(0, "status returned from ocfs2_meta_lock=%d\n", status);
mlog(0, "status returned from ocfs2_inode_lock=%d\n", status);
if (status != -ERESTARTSYS)
mlog(ML_ERROR, "Could not lock journal!\n");
goto done;
@ -1047,7 +1050,7 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
done:
/* drop the lock on this nodes journal */
if (got_lock)
ocfs2_meta_unlock(inode, 1);
ocfs2_inode_unlock(inode, 1);
if (inode)
iput(inode);
@ -1162,14 +1165,14 @@ static int ocfs2_trylock_journal(struct ocfs2_super *osb,
SET_INODE_JOURNAL(inode);
flags = OCFS2_META_LOCK_RECOVERY | OCFS2_META_LOCK_NOQUEUE;
status = ocfs2_meta_lock_full(inode, NULL, 1, flags);
status = ocfs2_inode_lock_full(inode, NULL, 1, flags);
if (status < 0) {
if (status != -EAGAIN)
mlog_errno(status);
goto bail;
}
ocfs2_meta_unlock(inode, 1);
ocfs2_inode_unlock(inode, 1);
bail:
if (inode)
iput(inode);
@ -1241,7 +1244,7 @@ static int ocfs2_orphan_filldir(void *priv, const char *name, int name_len,
/* Skip bad inodes so that recovery can continue */
iter = ocfs2_iget(p->osb, ino,
OCFS2_FI_FLAG_ORPHAN_RECOVERY);
OCFS2_FI_FLAG_ORPHAN_RECOVERY, 0);
if (IS_ERR(iter))
return 0;
@ -1277,7 +1280,7 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb,
}
mutex_lock(&orphan_dir_inode->i_mutex);
status = ocfs2_meta_lock(orphan_dir_inode, NULL, 0);
status = ocfs2_inode_lock(orphan_dir_inode, NULL, 0);
if (status < 0) {
mlog_errno(status);
goto out;
@ -1293,7 +1296,7 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb,
*head = priv.head;
out_cluster:
ocfs2_meta_unlock(orphan_dir_inode, 0);
ocfs2_inode_unlock(orphan_dir_inode, 0);
out:
mutex_unlock(&orphan_dir_inode->i_mutex);
iput(orphan_dir_inode);
@ -1380,10 +1383,10 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
iter = oi->ip_next_orphan;
spin_lock(&oi->ip_lock);
/* Delete voting may have set these on the assumption
* that the other node would wipe them successfully.
* If they are still in the node's orphan dir, we need
* to reset that state. */
/* The remote delete code may have set these on the
* assumption that the other node would wipe them
* successfully. If they are still in the node's
* orphan dir, we need to reset that state. */
oi->ip_flags &= ~(OCFS2_INODE_DELETED|OCFS2_INODE_SKIP_DELETE);
/* Set the proper information to get us going into

View file

@ -278,6 +278,12 @@ int ocfs2_journal_dirty_data(handle_t *handle,
/* simple file updates like chmod, etc. */
#define OCFS2_INODE_UPDATE_CREDITS 1
/* group extend. inode update and last group update. */
#define OCFS2_GROUP_EXTEND_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1)
/* group add. inode update and the new group update. */
#define OCFS2_GROUP_ADD_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1)
/* get one bit out of a suballocator: dinode + group descriptor +
* prev. group desc. if we relink. */
#define OCFS2_SUBALLOC_ALLOC (3)

View file

@ -75,18 +75,12 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
struct inode *local_alloc_inode);
/*
* Determine how large our local alloc window should be, in bits.
*
* These values (and the behavior in ocfs2_alloc_should_use_local) have
* been chosen so that most allocations, including new block groups go
* through local alloc.
*/
static inline int ocfs2_local_alloc_window_bits(struct ocfs2_super *osb)
{
BUG_ON(osb->s_clustersize_bits < 12);
BUG_ON(osb->s_clustersize_bits > 20);
return 2048 >> (osb->s_clustersize_bits - 12);
/* Size local alloc windows by the megabyte */
return osb->local_alloc_size << (20 - osb->s_clustersize_bits);
}
/*
@ -96,18 +90,23 @@ static inline int ocfs2_local_alloc_window_bits(struct ocfs2_super *osb)
int ocfs2_alloc_should_use_local(struct ocfs2_super *osb, u64 bits)
{
int la_bits = ocfs2_local_alloc_window_bits(osb);
int ret = 0;
if (osb->local_alloc_state != OCFS2_LA_ENABLED)
return 0;
goto bail;
/* la_bits should be at least twice the size (in clusters) of
* a new block group. We want to be sure block group
* allocations go through the local alloc, so allow an
* allocation to take up to half the bitmap. */
if (bits > (la_bits / 2))
return 0;
goto bail;
return 1;
ret = 1;
bail:
mlog(0, "state=%d, bits=%llu, la_bits=%d, ret=%d\n",
osb->local_alloc_state, (unsigned long long)bits, la_bits, ret);
return ret;
}
int ocfs2_load_local_alloc(struct ocfs2_super *osb)
@ -121,6 +120,19 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
mlog_entry_void();
if (ocfs2_mount_local(osb))
goto bail;
if (osb->local_alloc_size == 0)
goto bail;
if (ocfs2_local_alloc_window_bits(osb) >= osb->bitmap_cpg) {
mlog(ML_NOTICE, "Requested local alloc window %d is larger "
"than max possible %u. Using defaults.\n",
ocfs2_local_alloc_window_bits(osb), (osb->bitmap_cpg - 1));
osb->local_alloc_size = OCFS2_DEFAULT_LOCAL_ALLOC_SIZE;
}
/* read the alloc off disk */
inode = ocfs2_get_system_file_inode(osb, LOCAL_ALLOC_SYSTEM_INODE,
osb->slot_num);
@ -181,6 +193,9 @@ bail:
if (inode)
iput(inode);
mlog(0, "Local alloc window bits = %d\n",
ocfs2_local_alloc_window_bits(osb));
mlog_exit(status);
return status;
}
@ -231,7 +246,7 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
mutex_lock(&main_bm_inode->i_mutex);
status = ocfs2_meta_lock(main_bm_inode, &main_bm_bh, 1);
status = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1);
if (status < 0) {
mlog_errno(status);
goto out_mutex;
@ -286,7 +301,7 @@ out_unlock:
if (main_bm_bh)
brelse(main_bm_bh);
ocfs2_meta_unlock(main_bm_inode, 1);
ocfs2_inode_unlock(main_bm_inode, 1);
out_mutex:
mutex_unlock(&main_bm_inode->i_mutex);
@ -399,7 +414,7 @@ int ocfs2_complete_local_alloc_recovery(struct ocfs2_super *osb,
mutex_lock(&main_bm_inode->i_mutex);
status = ocfs2_meta_lock(main_bm_inode, &main_bm_bh, 1);
status = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1);
if (status < 0) {
mlog_errno(status);
goto out_mutex;
@ -424,7 +439,7 @@ int ocfs2_complete_local_alloc_recovery(struct ocfs2_super *osb,
ocfs2_commit_trans(osb, handle);
out_unlock:
ocfs2_meta_unlock(main_bm_inode, 1);
ocfs2_inode_unlock(main_bm_inode, 1);
out_mutex:
mutex_unlock(&main_bm_inode->i_mutex);
@ -521,6 +536,9 @@ bail:
iput(local_alloc_inode);
}
mlog(0, "bits=%d, slot=%d, ret=%d\n", bits_wanted, osb->slot_num,
status);
mlog_exit(status);
return status;
}

125
fs/ocfs2/locks.c Normal file
View file

@ -0,0 +1,125 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* locks.c
*
* Userspace file locking support
*
* Copyright (C) 2007 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#include <linux/fs.h>
#define MLOG_MASK_PREFIX ML_INODE
#include <cluster/masklog.h>
#include "ocfs2.h"
#include "dlmglue.h"
#include "file.h"
#include "locks.h"
static int ocfs2_do_flock(struct file *file, struct inode *inode,
int cmd, struct file_lock *fl)
{
int ret = 0, level = 0, trylock = 0;
struct ocfs2_file_private *fp = file->private_data;
struct ocfs2_lock_res *lockres = &fp->fp_flock;
if (fl->fl_type == F_WRLCK)
level = 1;
if (!IS_SETLKW(cmd))
trylock = 1;
mutex_lock(&fp->fp_mutex);
if (lockres->l_flags & OCFS2_LOCK_ATTACHED &&
lockres->l_level > LKM_NLMODE) {
int old_level = 0;
if (lockres->l_level == LKM_EXMODE)
old_level = 1;
if (level == old_level)
goto out;
/*
* Converting an existing lock is not guaranteed to be
* atomic, so we can get away with simply unlocking
* here and allowing the lock code to try at the new
* level.
*/
flock_lock_file_wait(file,
&(struct file_lock){.fl_type = F_UNLCK});
ocfs2_file_unlock(file);
}
ret = ocfs2_file_lock(file, level, trylock);
if (ret) {
if (ret == -EAGAIN && trylock)
ret = -EWOULDBLOCK;
else
mlog_errno(ret);
goto out;
}
ret = flock_lock_file_wait(file, fl);
out:
mutex_unlock(&fp->fp_mutex);
return ret;
}
static int ocfs2_do_funlock(struct file *file, int cmd, struct file_lock *fl)
{
int ret;
struct ocfs2_file_private *fp = file->private_data;
mutex_lock(&fp->fp_mutex);
ocfs2_file_unlock(file);
ret = flock_lock_file_wait(file, fl);
mutex_unlock(&fp->fp_mutex);
return ret;
}
/*
* Overall flow of ocfs2_flock() was influenced by gfs2_flock().
*/
int ocfs2_flock(struct file *file, int cmd, struct file_lock *fl)
{
struct inode *inode = file->f_mapping->host;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
if (!(fl->fl_flags & FL_FLOCK))
return -ENOLCK;
if (__mandatory_lock(inode))
return -ENOLCK;
if ((osb->s_mount_opt & OCFS2_MOUNT_LOCALFLOCKS) ||
ocfs2_mount_local(osb))
return flock_lock_file_wait(file, fl);
if (fl->fl_type == F_UNLCK)
return ocfs2_do_funlock(file, cmd, fl);
else
return ocfs2_do_flock(file, inode, cmd, fl);
}

View file

@ -1,9 +1,9 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* vote.h
* locks.h
*
* description here
* Function prototypes for Userspace file locking support
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
*
@ -23,26 +23,9 @@
* Boston, MA 021110-1307, USA.
*/
#ifndef OCFS2_LOCKS_H
#define OCFS2_LOCKS_H
#ifndef VOTE_H
#define VOTE_H
int ocfs2_flock(struct file *file, int cmd, struct file_lock *fl);
int ocfs2_vote_thread(void *arg);
static inline void ocfs2_kick_vote_thread(struct ocfs2_super *osb)
{
spin_lock(&osb->vote_task_lock);
/* make sure the voting thread gets a swipe at whatever changes
* the caller may have made to the voting state */
osb->vote_wake_sequence++;
spin_unlock(&osb->vote_task_lock);
wake_up(&osb->vote_event);
}
int ocfs2_request_mount_vote(struct ocfs2_super *osb);
int ocfs2_request_umount_vote(struct ocfs2_super *osb);
int ocfs2_register_net_handlers(struct ocfs2_super *osb);
void ocfs2_unregister_net_handlers(struct ocfs2_super *osb);
void ocfs2_remove_node_from_vote_queues(struct ocfs2_super *osb,
int node_num);
#endif
#endif /* OCFS2_LOCKS_H */

View file

@ -168,7 +168,7 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
* node. Taking the data lock will also ensure that we don't
* attempt page truncation as part of a downconvert.
*/
ret = ocfs2_meta_lock(inode, &di_bh, 1);
ret = ocfs2_inode_lock(inode, &di_bh, 1);
if (ret < 0) {
mlog_errno(ret);
goto out;
@ -181,21 +181,12 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
*/
down_write(&OCFS2_I(inode)->ip_alloc_sem);
ret = ocfs2_data_lock(inode, 1);
if (ret < 0) {
mlog_errno(ret);
goto out_meta_unlock;
}
ret = __ocfs2_page_mkwrite(inode, di_bh, page);
ocfs2_data_unlock(inode, 1);
out_meta_unlock:
up_write(&OCFS2_I(inode)->ip_alloc_sem);
brelse(di_bh);
ocfs2_meta_unlock(inode, 1);
ocfs2_inode_unlock(inode, 1);
out:
ret2 = ocfs2_vm_op_unblock_sigs(&oldset);
@ -214,13 +205,13 @@ int ocfs2_mmap(struct file *file, struct vm_area_struct *vma)
{
int ret = 0, lock_level = 0;
ret = ocfs2_meta_lock_atime(file->f_dentry->d_inode,
ret = ocfs2_inode_lock_atime(file->f_dentry->d_inode,
file->f_vfsmnt, &lock_level);
if (ret < 0) {
mlog_errno(ret);
goto out;
}
ocfs2_meta_unlock(file->f_dentry->d_inode, lock_level);
ocfs2_inode_unlock(file->f_dentry->d_inode, lock_level);
out:
vma->vm_ops = &ocfs2_file_vm_ops;
vma->vm_flags |= VM_CAN_NONLINEAR;

View file

@ -60,7 +60,6 @@
#include "symlink.h"
#include "sysfile.h"
#include "uptodate.h"
#include "vote.h"
#include "buffer_head_io.h"
@ -116,7 +115,7 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
mlog(0, "find name %.*s in directory %llu\n", dentry->d_name.len,
dentry->d_name.name, (unsigned long long)OCFS2_I(dir)->ip_blkno);
status = ocfs2_meta_lock(dir, NULL, 0);
status = ocfs2_inode_lock(dir, NULL, 0);
if (status < 0) {
if (status != -ENOENT)
mlog_errno(status);
@ -129,7 +128,7 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
if (status < 0)
goto bail_add;
inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0);
inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0, 0);
if (IS_ERR(inode)) {
ret = ERR_PTR(-EACCES);
goto bail_unlock;
@ -176,8 +175,8 @@ bail_unlock:
/* Don't drop the cluster lock until *after* the d_add --
* unlink on another node will message us to remove that
* dentry under this lock so otherwise we can race this with
* the vote thread and have a stale dentry. */
ocfs2_meta_unlock(dir, 0);
* the downconvert thread and have a stale dentry. */
ocfs2_inode_unlock(dir, 0);
bail:
@ -209,7 +208,7 @@ static int ocfs2_mknod(struct inode *dir,
/* get our super block */
osb = OCFS2_SB(dir->i_sb);
status = ocfs2_meta_lock(dir, &parent_fe_bh, 1);
status = ocfs2_inode_lock(dir, &parent_fe_bh, 1);
if (status < 0) {
if (status != -ENOENT)
mlog_errno(status);
@ -323,7 +322,7 @@ leave:
if (handle)
ocfs2_commit_trans(osb, handle);
ocfs2_meta_unlock(dir, 1);
ocfs2_inode_unlock(dir, 1);
if (status == -ENOSPC)
mlog(0, "Disk is full\n");
@ -553,7 +552,7 @@ static int ocfs2_link(struct dentry *old_dentry,
if (S_ISDIR(inode->i_mode))
return -EPERM;
err = ocfs2_meta_lock(dir, &parent_fe_bh, 1);
err = ocfs2_inode_lock(dir, &parent_fe_bh, 1);
if (err < 0) {
if (err != -ENOENT)
mlog_errno(err);
@ -578,7 +577,7 @@ static int ocfs2_link(struct dentry *old_dentry,
goto out;
}
err = ocfs2_meta_lock(inode, &fe_bh, 1);
err = ocfs2_inode_lock(inode, &fe_bh, 1);
if (err < 0) {
if (err != -ENOENT)
mlog_errno(err);
@ -643,10 +642,10 @@ static int ocfs2_link(struct dentry *old_dentry,
out_commit:
ocfs2_commit_trans(osb, handle);
out_unlock_inode:
ocfs2_meta_unlock(inode, 1);
ocfs2_inode_unlock(inode, 1);
out:
ocfs2_meta_unlock(dir, 1);
ocfs2_inode_unlock(dir, 1);
if (de_bh)
brelse(de_bh);
@ -720,7 +719,7 @@ static int ocfs2_unlink(struct inode *dir,
return -EPERM;
}
status = ocfs2_meta_lock(dir, &parent_node_bh, 1);
status = ocfs2_inode_lock(dir, &parent_node_bh, 1);
if (status < 0) {
if (status != -ENOENT)
mlog_errno(status);
@ -745,7 +744,7 @@ static int ocfs2_unlink(struct inode *dir,
goto leave;
}
status = ocfs2_meta_lock(inode, &fe_bh, 1);
status = ocfs2_inode_lock(inode, &fe_bh, 1);
if (status < 0) {
if (status != -ENOENT)
mlog_errno(status);
@ -765,7 +764,7 @@ static int ocfs2_unlink(struct inode *dir,
status = ocfs2_remote_dentry_delete(dentry);
if (status < 0) {
/* This vote should succeed under all normal
/* This remote delete should succeed under all normal
* circumstances. */
mlog_errno(status);
goto leave;
@ -841,13 +840,13 @@ leave:
ocfs2_commit_trans(osb, handle);
if (child_locked)
ocfs2_meta_unlock(inode, 1);
ocfs2_inode_unlock(inode, 1);
ocfs2_meta_unlock(dir, 1);
ocfs2_inode_unlock(dir, 1);
if (orphan_dir) {
/* This was locked for us in ocfs2_prepare_orphan_dir() */
ocfs2_meta_unlock(orphan_dir, 1);
ocfs2_inode_unlock(orphan_dir, 1);
mutex_unlock(&orphan_dir->i_mutex);
iput(orphan_dir);
}
@ -908,7 +907,7 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
inode1 = tmpinode;
}
/* lock id2 */
status = ocfs2_meta_lock(inode2, bh2, 1);
status = ocfs2_inode_lock(inode2, bh2, 1);
if (status < 0) {
if (status != -ENOENT)
mlog_errno(status);
@ -917,14 +916,14 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
}
/* lock id1 */
status = ocfs2_meta_lock(inode1, bh1, 1);
status = ocfs2_inode_lock(inode1, bh1, 1);
if (status < 0) {
/*
* An error return must mean that no cluster locks
* were held on function exit.
*/
if (oi1->ip_blkno != oi2->ip_blkno)
ocfs2_meta_unlock(inode2, 1);
ocfs2_inode_unlock(inode2, 1);
if (status != -ENOENT)
mlog_errno(status);
@ -937,10 +936,10 @@ bail:
static void ocfs2_double_unlock(struct inode *inode1, struct inode *inode2)
{
ocfs2_meta_unlock(inode1, 1);
ocfs2_inode_unlock(inode1, 1);
if (inode1 != inode2)
ocfs2_meta_unlock(inode2, 1);
ocfs2_inode_unlock(inode2, 1);
}
static int ocfs2_rename(struct inode *old_dir,
@ -1031,10 +1030,11 @@ static int ocfs2_rename(struct inode *old_dir,
/*
* Aside from allowing a meta data update, the locking here
* also ensures that the vote thread on other nodes won't have
* to concurrently downconvert the inode and the dentry locks.
* also ensures that the downconvert thread on other nodes
* won't have to concurrently downconvert the inode and the
* dentry locks.
*/
status = ocfs2_meta_lock(old_inode, &old_inode_bh, 1);
status = ocfs2_inode_lock(old_inode, &old_inode_bh, 1);
if (status < 0) {
if (status != -ENOENT)
mlog_errno(status);
@ -1143,7 +1143,7 @@ static int ocfs2_rename(struct inode *old_dir,
goto bail;
}
status = ocfs2_meta_lock(new_inode, &newfe_bh, 1);
status = ocfs2_inode_lock(new_inode, &newfe_bh, 1);
if (status < 0) {
if (status != -ENOENT)
mlog_errno(status);
@ -1355,14 +1355,14 @@ bail:
ocfs2_double_unlock(old_dir, new_dir);
if (old_child_locked)
ocfs2_meta_unlock(old_inode, 1);
ocfs2_inode_unlock(old_inode, 1);
if (new_child_locked)
ocfs2_meta_unlock(new_inode, 1);
ocfs2_inode_unlock(new_inode, 1);
if (orphan_dir) {
/* This was locked for us in ocfs2_prepare_orphan_dir() */
ocfs2_meta_unlock(orphan_dir, 1);
ocfs2_inode_unlock(orphan_dir, 1);
mutex_unlock(&orphan_dir->i_mutex);
iput(orphan_dir);
}
@ -1530,7 +1530,7 @@ static int ocfs2_symlink(struct inode *dir,
credits = ocfs2_calc_symlink_credits(sb);
/* lock the parent directory */
status = ocfs2_meta_lock(dir, &parent_fe_bh, 1);
status = ocfs2_inode_lock(dir, &parent_fe_bh, 1);
if (status < 0) {
if (status != -ENOENT)
mlog_errno(status);
@ -1657,7 +1657,7 @@ bail:
if (handle)
ocfs2_commit_trans(osb, handle);
ocfs2_meta_unlock(dir, 1);
ocfs2_inode_unlock(dir, 1);
if (new_fe_bh)
brelse(new_fe_bh);
@ -1735,7 +1735,7 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
mutex_lock(&orphan_dir_inode->i_mutex);
status = ocfs2_meta_lock(orphan_dir_inode, &orphan_dir_bh, 1);
status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1);
if (status < 0) {
mlog_errno(status);
goto leave;
@ -1745,7 +1745,7 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
orphan_dir_bh, name,
OCFS2_ORPHAN_NAMELEN, de_bh);
if (status < 0) {
ocfs2_meta_unlock(orphan_dir_inode, 1);
ocfs2_inode_unlock(orphan_dir_inode, 1);
mlog_errno(status);
goto leave;

View file

@ -101,6 +101,7 @@ enum ocfs2_unlock_action {
* about to be
* dropped. */
#define OCFS2_LOCK_QUEUED (0x00000100) /* queued for downconvert */
#define OCFS2_LOCK_NOCACHE (0x00000200) /* don't use a holder count */
struct ocfs2_lock_res_ops;
@ -170,6 +171,7 @@ enum ocfs2_mount_options
OCFS2_MOUNT_NOINTR = 1 << 2, /* Don't catch signals */
OCFS2_MOUNT_ERRORS_PANIC = 1 << 3, /* Panic on errors */
OCFS2_MOUNT_DATA_WRITEBACK = 1 << 4, /* No data ordering */
OCFS2_MOUNT_LOCALFLOCKS = 1 << 5, /* No cluster aware user file locks */
};
#define OCFS2_OSB_SOFT_RO 0x0001
@ -189,9 +191,7 @@ struct ocfs2_super
struct ocfs2_slot_info *slot_info;
spinlock_t node_map_lock;
struct ocfs2_node_map mounted_map;
struct ocfs2_node_map recovery_map;
struct ocfs2_node_map umount_map;
u64 root_blkno;
u64 system_dir_blkno;
@ -231,7 +231,9 @@ struct ocfs2_super
wait_queue_head_t checkpoint_event;
atomic_t needs_checkpoint;
struct ocfs2_journal *journal;
unsigned long osb_commit_interval;
int local_alloc_size;
enum ocfs2_local_alloc_state local_alloc_state;
struct buffer_head *local_alloc_bh;
u64 la_last_gd;
@ -254,28 +256,21 @@ struct ocfs2_super
wait_queue_head_t recovery_event;
spinlock_t vote_task_lock;
struct task_struct *vote_task;
wait_queue_head_t vote_event;
unsigned long vote_wake_sequence;
unsigned long vote_work_sequence;
spinlock_t dc_task_lock;
struct task_struct *dc_task;
wait_queue_head_t dc_event;
unsigned long dc_wake_sequence;
unsigned long dc_work_sequence;
/*
* Any thread can add locks to the list, but the downconvert
* thread is the only one allowed to remove locks. Any change
* to this rule requires updating
* ocfs2_downconvert_thread_do_work().
*/
struct list_head blocked_lock_list;
unsigned long blocked_lock_count;
struct list_head vote_list;
int vote_count;
u32 net_key;
spinlock_t net_response_lock;
unsigned int net_response_ids;
struct list_head net_response_list;
struct o2hb_callback_func osb_hb_up;
struct o2hb_callback_func osb_hb_down;
struct list_head osb_net_handlers;
wait_queue_head_t osb_mount_event;
/* Truncate log info */

View file

@ -231,6 +231,20 @@ struct ocfs2_space_resv {
#define OCFS2_IOC_RESVSP64 _IOW ('X', 42, struct ocfs2_space_resv)
#define OCFS2_IOC_UNRESVSP64 _IOW ('X', 43, struct ocfs2_space_resv)
/* Used to pass group descriptor data when online resize is done */
struct ocfs2_new_group_input {
__u64 group; /* Group descriptor's blkno. */
__u32 clusters; /* Total number of clusters in this group */
__u32 frees; /* Total free clusters in this group */
__u16 chain; /* Chain for this group */
__u16 reserved1;
__u32 reserved2;
};
#define OCFS2_IOC_GROUP_EXTEND _IOW('o', 1, int)
#define OCFS2_IOC_GROUP_ADD _IOW('o', 2,struct ocfs2_new_group_input)
#define OCFS2_IOC_GROUP_ADD64 _IOW('o', 3,struct ocfs2_new_group_input)
/*
* Journal Flags (ocfs2_dinode.id1.journal1.i_flags)
*/
@ -256,6 +270,14 @@ struct ocfs2_space_resv {
/* Journal limits (in bytes) */
#define OCFS2_MIN_JOURNAL_SIZE (4 * 1024 * 1024)
/*
* Default local alloc size (in megabytes)
*
* The value chosen should be such that most allocations, including new
* block groups, use local alloc.
*/
#define OCFS2_DEFAULT_LOCAL_ALLOC_SIZE 8
struct ocfs2_system_inode_info {
char *si_name;
int si_iflags;

View file

@ -45,6 +45,7 @@ enum ocfs2_lock_type {
OCFS2_LOCK_TYPE_RW,
OCFS2_LOCK_TYPE_DENTRY,
OCFS2_LOCK_TYPE_OPEN,
OCFS2_LOCK_TYPE_FLOCK,
OCFS2_NUM_LOCK_TYPES
};
@ -73,6 +74,9 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type)
case OCFS2_LOCK_TYPE_OPEN:
c = 'O';
break;
case OCFS2_LOCK_TYPE_FLOCK:
c = 'F';
break;
default:
c = '\0';
}
@ -90,6 +94,7 @@ static char *ocfs2_lock_type_strings[] = {
[OCFS2_LOCK_TYPE_RW] = "Write/Read",
[OCFS2_LOCK_TYPE_DENTRY] = "Dentry",
[OCFS2_LOCK_TYPE_OPEN] = "Open",
[OCFS2_LOCK_TYPE_FLOCK] = "Flock",
};
static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type)

634
fs/ocfs2/resize.c Normal file
View file

@ -0,0 +1,634 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* resize.c
*
* volume resize.
* Inspired by ext3/resize.c.
*
* Copyright (C) 2007 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#include <linux/fs.h>
#include <linux/types.h>
#define MLOG_MASK_PREFIX ML_DISK_ALLOC
#include <cluster/masklog.h>
#include "ocfs2.h"
#include "alloc.h"
#include "dlmglue.h"
#include "inode.h"
#include "journal.h"
#include "super.h"
#include "sysfile.h"
#include "uptodate.h"
#include "buffer_head_io.h"
#include "suballoc.h"
#include "resize.h"
/*
* Check whether there are new backup superblocks exist
* in the last group. If there are some, mark them or clear
* them in the bitmap.
*
* Return how many backups we find in the last group.
*/
static u16 ocfs2_calc_new_backup_super(struct inode *inode,
struct ocfs2_group_desc *gd,
int new_clusters,
u32 first_new_cluster,
u16 cl_cpg,
int set)
{
int i;
u16 backups = 0;
u32 cluster;
u64 blkno, gd_blkno, lgd_blkno = le64_to_cpu(gd->bg_blkno);
for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) {
blkno = ocfs2_backup_super_blkno(inode->i_sb, i);
cluster = ocfs2_blocks_to_clusters(inode->i_sb, blkno);
gd_blkno = ocfs2_which_cluster_group(inode, cluster);
if (gd_blkno < lgd_blkno)
continue;
else if (gd_blkno > lgd_blkno)
break;
if (set)
ocfs2_set_bit(cluster % cl_cpg,
(unsigned long *)gd->bg_bitmap);
else
ocfs2_clear_bit(cluster % cl_cpg,
(unsigned long *)gd->bg_bitmap);
backups++;
}
mlog_exit_void();
return backups;
}
static int ocfs2_update_last_group_and_inode(handle_t *handle,
struct inode *bm_inode,
struct buffer_head *bm_bh,
struct buffer_head *group_bh,
u32 first_new_cluster,
int new_clusters)
{
int ret = 0;
struct ocfs2_super *osb = OCFS2_SB(bm_inode->i_sb);
struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bm_bh->b_data;
struct ocfs2_chain_list *cl = &fe->id2.i_chain;
struct ocfs2_chain_rec *cr;
struct ocfs2_group_desc *group;
u16 chain, num_bits, backups = 0;
u16 cl_bpc = le16_to_cpu(cl->cl_bpc);
u16 cl_cpg = le16_to_cpu(cl->cl_cpg);
mlog_entry("(new_clusters=%d, first_new_cluster = %u)\n",
new_clusters, first_new_cluster);
ret = ocfs2_journal_access(handle, bm_inode, group_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret < 0) {
mlog_errno(ret);
goto out;
}
group = (struct ocfs2_group_desc *)group_bh->b_data;
/* update the group first. */
num_bits = new_clusters * cl_bpc;
le16_add_cpu(&group->bg_bits, num_bits);
le16_add_cpu(&group->bg_free_bits_count, num_bits);
/*
* check whether there are some new backup superblocks exist in
* this group and update the group bitmap accordingly.
*/
if (OCFS2_HAS_COMPAT_FEATURE(osb->sb,
OCFS2_FEATURE_COMPAT_BACKUP_SB)) {
backups = ocfs2_calc_new_backup_super(bm_inode,
group,
new_clusters,
first_new_cluster,
cl_cpg, 1);
le16_add_cpu(&group->bg_free_bits_count, -1 * backups);
}
ret = ocfs2_journal_dirty(handle, group_bh);
if (ret < 0) {
mlog_errno(ret);
goto out_rollback;
}
/* update the inode accordingly. */
ret = ocfs2_journal_access(handle, bm_inode, bm_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret < 0) {
mlog_errno(ret);
goto out_rollback;
}
chain = le16_to_cpu(group->bg_chain);
cr = (&cl->cl_recs[chain]);
le32_add_cpu(&cr->c_total, num_bits);
le32_add_cpu(&cr->c_free, num_bits);
le32_add_cpu(&fe->id1.bitmap1.i_total, num_bits);
le32_add_cpu(&fe->i_clusters, new_clusters);
if (backups) {
le32_add_cpu(&cr->c_free, -1 * backups);
le32_add_cpu(&fe->id1.bitmap1.i_used, backups);
}
spin_lock(&OCFS2_I(bm_inode)->ip_lock);
OCFS2_I(bm_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
le64_add_cpu(&fe->i_size, new_clusters << osb->s_clustersize_bits);
spin_unlock(&OCFS2_I(bm_inode)->ip_lock);
i_size_write(bm_inode, le64_to_cpu(fe->i_size));
ocfs2_journal_dirty(handle, bm_bh);
out_rollback:
if (ret < 0) {
ocfs2_calc_new_backup_super(bm_inode,
group,
new_clusters,
first_new_cluster,
cl_cpg, 0);
le16_add_cpu(&group->bg_free_bits_count, backups);
le16_add_cpu(&group->bg_bits, -1 * num_bits);
le16_add_cpu(&group->bg_free_bits_count, -1 * num_bits);
}
out:
mlog_exit(ret);
return ret;
}
static int update_backups(struct inode * inode, u32 clusters, char *data)
{
int i, ret = 0;
u32 cluster;
u64 blkno;
struct buffer_head *backup = NULL;
struct ocfs2_dinode *backup_di = NULL;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
/* calculate the real backups we need to update. */
for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) {
blkno = ocfs2_backup_super_blkno(inode->i_sb, i);
cluster = ocfs2_blocks_to_clusters(inode->i_sb, blkno);
if (cluster > clusters)
break;
ret = ocfs2_read_block(osb, blkno, &backup, 0, NULL);
if (ret < 0) {
mlog_errno(ret);
break;
}
memcpy(backup->b_data, data, inode->i_sb->s_blocksize);
backup_di = (struct ocfs2_dinode *)backup->b_data;
backup_di->i_blkno = cpu_to_le64(blkno);
ret = ocfs2_write_super_or_backup(osb, backup);
brelse(backup);
backup = NULL;
if (ret < 0) {
mlog_errno(ret);
break;
}
}
return ret;
}
static void ocfs2_update_super_and_backups(struct inode *inode,
int new_clusters)
{
int ret;
u32 clusters = 0;
struct buffer_head *super_bh = NULL;
struct ocfs2_dinode *super_di = NULL;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
/*
* update the superblock last.
* It doesn't matter if the write failed.
*/
ret = ocfs2_read_block(osb, OCFS2_SUPER_BLOCK_BLKNO,
&super_bh, 0, NULL);
if (ret < 0) {
mlog_errno(ret);
goto out;
}
super_di = (struct ocfs2_dinode *)super_bh->b_data;
le32_add_cpu(&super_di->i_clusters, new_clusters);
clusters = le32_to_cpu(super_di->i_clusters);
ret = ocfs2_write_super_or_backup(osb, super_bh);
if (ret < 0) {
mlog_errno(ret);
goto out;
}
if (OCFS2_HAS_COMPAT_FEATURE(osb->sb, OCFS2_FEATURE_COMPAT_BACKUP_SB))
ret = update_backups(inode, clusters, super_bh->b_data);
out:
brelse(super_bh);
if (ret)
printk(KERN_WARNING "ocfs2: Failed to update super blocks on %s"
" during fs resize. This condition is not fatal,"
" but fsck.ocfs2 should be run to fix it\n",
osb->dev_str);
return;
}
/*
* Extend the filesystem to the new number of clusters specified. This entry
* point is only used to extend the current filesystem to the end of the last
* existing group.
*/
int ocfs2_group_extend(struct inode * inode, int new_clusters)
{
int ret;
handle_t *handle;
struct buffer_head *main_bm_bh = NULL;
struct buffer_head *group_bh = NULL;
struct inode *main_bm_inode = NULL;
struct ocfs2_dinode *fe = NULL;
struct ocfs2_group_desc *group = NULL;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
u16 cl_bpc;
u32 first_new_cluster;
u64 lgd_blkno;
mlog_entry_void();
if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
return -EROFS;
if (new_clusters < 0)
return -EINVAL;
else if (new_clusters == 0)
return 0;
main_bm_inode = ocfs2_get_system_file_inode(osb,
GLOBAL_BITMAP_SYSTEM_INODE,
OCFS2_INVALID_SLOT);
if (!main_bm_inode) {
ret = -EINVAL;
mlog_errno(ret);
goto out;
}
mutex_lock(&main_bm_inode->i_mutex);
ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1);
if (ret < 0) {
mlog_errno(ret);
goto out_mutex;
}
fe = (struct ocfs2_dinode *)main_bm_bh->b_data;
if (le16_to_cpu(fe->id2.i_chain.cl_cpg) !=
ocfs2_group_bitmap_size(osb->sb) * 8) {
mlog(ML_ERROR, "The disk is too old and small. "
"Force to do offline resize.");
ret = -EINVAL;
goto out_unlock;
}
if (!OCFS2_IS_VALID_DINODE(fe)) {
OCFS2_RO_ON_INVALID_DINODE(main_bm_inode->i_sb, fe);
ret = -EIO;
goto out_unlock;
}
first_new_cluster = le32_to_cpu(fe->i_clusters);
lgd_blkno = ocfs2_which_cluster_group(main_bm_inode,
first_new_cluster - 1);
ret = ocfs2_read_block(osb, lgd_blkno, &group_bh, OCFS2_BH_CACHED,
main_bm_inode);
if (ret < 0) {
mlog_errno(ret);
goto out_unlock;
}
group = (struct ocfs2_group_desc *)group_bh->b_data;
ret = ocfs2_check_group_descriptor(inode->i_sb, fe, group);
if (ret) {
mlog_errno(ret);
goto out_unlock;
}
cl_bpc = le16_to_cpu(fe->id2.i_chain.cl_bpc);
if (le16_to_cpu(group->bg_bits) / cl_bpc + new_clusters >
le16_to_cpu(fe->id2.i_chain.cl_cpg)) {
ret = -EINVAL;
goto out_unlock;
}
mlog(0, "extend the last group at %llu, new clusters = %d\n",
(unsigned long long)le64_to_cpu(group->bg_blkno), new_clusters);
handle = ocfs2_start_trans(osb, OCFS2_GROUP_EXTEND_CREDITS);
if (IS_ERR(handle)) {
mlog_errno(PTR_ERR(handle));
ret = -EINVAL;
goto out_unlock;
}
/* update the last group descriptor and inode. */
ret = ocfs2_update_last_group_and_inode(handle, main_bm_inode,
main_bm_bh, group_bh,
first_new_cluster,
new_clusters);
if (ret) {
mlog_errno(ret);
goto out_commit;
}
ocfs2_update_super_and_backups(main_bm_inode, new_clusters);
out_commit:
ocfs2_commit_trans(osb, handle);
out_unlock:
brelse(group_bh);
brelse(main_bm_bh);
ocfs2_inode_unlock(main_bm_inode, 1);
out_mutex:
mutex_unlock(&main_bm_inode->i_mutex);
iput(main_bm_inode);
out:
mlog_exit_void();
return ret;
}
static int ocfs2_check_new_group(struct inode *inode,
struct ocfs2_dinode *di,
struct ocfs2_new_group_input *input,
struct buffer_head *group_bh)
{
int ret;
struct ocfs2_group_desc *gd;
u16 cl_bpc = le16_to_cpu(di->id2.i_chain.cl_bpc);
unsigned int max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) *
le16_to_cpu(di->id2.i_chain.cl_bpc);
gd = (struct ocfs2_group_desc *)group_bh->b_data;
ret = -EIO;
if (!OCFS2_IS_VALID_GROUP_DESC(gd))
mlog(ML_ERROR, "Group descriptor # %llu isn't valid.\n",
(unsigned long long)le64_to_cpu(gd->bg_blkno));
else if (di->i_blkno != gd->bg_parent_dinode)
mlog(ML_ERROR, "Group descriptor # %llu has bad parent "
"pointer (%llu, expected %llu)\n",
(unsigned long long)le64_to_cpu(gd->bg_blkno),
(unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
(unsigned long long)le64_to_cpu(di->i_blkno));
else if (le16_to_cpu(gd->bg_bits) > max_bits)
mlog(ML_ERROR, "Group descriptor # %llu has bit count of %u\n",
(unsigned long long)le64_to_cpu(gd->bg_blkno),
le16_to_cpu(gd->bg_bits));
else if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits))
mlog(ML_ERROR, "Group descriptor # %llu has bit count %u but "
"claims that %u are free\n",
(unsigned long long)le64_to_cpu(gd->bg_blkno),
le16_to_cpu(gd->bg_bits),
le16_to_cpu(gd->bg_free_bits_count));
else if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size)))
mlog(ML_ERROR, "Group descriptor # %llu has bit count %u but "
"max bitmap bits of %u\n",
(unsigned long long)le64_to_cpu(gd->bg_blkno),
le16_to_cpu(gd->bg_bits),
8 * le16_to_cpu(gd->bg_size));
else if (le16_to_cpu(gd->bg_chain) != input->chain)
mlog(ML_ERROR, "Group descriptor # %llu has bad chain %u "
"while input has %u set.\n",
(unsigned long long)le64_to_cpu(gd->bg_blkno),
le16_to_cpu(gd->bg_chain), input->chain);
else if (le16_to_cpu(gd->bg_bits) != input->clusters * cl_bpc)
mlog(ML_ERROR, "Group descriptor # %llu has bit count %u but "
"input has %u clusters set\n",
(unsigned long long)le64_to_cpu(gd->bg_blkno),
le16_to_cpu(gd->bg_bits), input->clusters);
else if (le16_to_cpu(gd->bg_free_bits_count) != input->frees * cl_bpc)
mlog(ML_ERROR, "Group descriptor # %llu has free bit count %u "
"but it should have %u set\n",
(unsigned long long)le64_to_cpu(gd->bg_blkno),
le16_to_cpu(gd->bg_bits),
input->frees * cl_bpc);
else
ret = 0;
return ret;
}
static int ocfs2_verify_group_and_input(struct inode *inode,
struct ocfs2_dinode *di,
struct ocfs2_new_group_input *input,
struct buffer_head *group_bh)
{
u16 cl_count = le16_to_cpu(di->id2.i_chain.cl_count);
u16 cl_cpg = le16_to_cpu(di->id2.i_chain.cl_cpg);
u16 next_free = le16_to_cpu(di->id2.i_chain.cl_next_free_rec);
u32 cluster = ocfs2_blocks_to_clusters(inode->i_sb, input->group);
u32 total_clusters = le32_to_cpu(di->i_clusters);
int ret = -EINVAL;
if (cluster < total_clusters)
mlog(ML_ERROR, "add a group which is in the current volume.\n");
else if (input->chain >= cl_count)
mlog(ML_ERROR, "input chain exceeds the limit.\n");
else if (next_free != cl_count && next_free != input->chain)
mlog(ML_ERROR,
"the add group should be in chain %u\n", next_free);
else if (total_clusters + input->clusters < total_clusters)
mlog(ML_ERROR, "add group's clusters overflow.\n");
else if (input->clusters > cl_cpg)
mlog(ML_ERROR, "the cluster exceeds the maximum of a group\n");
else if (input->frees > input->clusters)
mlog(ML_ERROR, "the free cluster exceeds the total clusters\n");
else if (total_clusters % cl_cpg != 0)
mlog(ML_ERROR,
"the last group isn't full. Use group extend first.\n");
else if (input->group != ocfs2_which_cluster_group(inode, cluster))
mlog(ML_ERROR, "group blkno is invalid\n");
else if ((ret = ocfs2_check_new_group(inode, di, input, group_bh)))
mlog(ML_ERROR, "group descriptor check failed.\n");
else
ret = 0;
return ret;
}
/* Add a new group descriptor to global_bitmap. */
int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
{
int ret;
handle_t *handle;
struct buffer_head *main_bm_bh = NULL;
struct inode *main_bm_inode = NULL;
struct ocfs2_dinode *fe = NULL;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct buffer_head *group_bh = NULL;
struct ocfs2_group_desc *group = NULL;
struct ocfs2_chain_list *cl;
struct ocfs2_chain_rec *cr;
u16 cl_bpc;
mlog_entry_void();
if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
return -EROFS;
main_bm_inode = ocfs2_get_system_file_inode(osb,
GLOBAL_BITMAP_SYSTEM_INODE,
OCFS2_INVALID_SLOT);
if (!main_bm_inode) {
ret = -EINVAL;
mlog_errno(ret);
goto out;
}
mutex_lock(&main_bm_inode->i_mutex);
ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1);
if (ret < 0) {
mlog_errno(ret);
goto out_mutex;
}
fe = (struct ocfs2_dinode *)main_bm_bh->b_data;
if (le16_to_cpu(fe->id2.i_chain.cl_cpg) !=
ocfs2_group_bitmap_size(osb->sb) * 8) {
mlog(ML_ERROR, "The disk is too old and small."
" Force to do offline resize.");
ret = -EINVAL;
goto out_unlock;
}
ret = ocfs2_read_block(osb, input->group, &group_bh, 0, NULL);
if (ret < 0) {
mlog(ML_ERROR, "Can't read the group descriptor # %llu "
"from the device.", (unsigned long long)input->group);
goto out_unlock;
}
ocfs2_set_new_buffer_uptodate(inode, group_bh);
ret = ocfs2_verify_group_and_input(main_bm_inode, fe, input, group_bh);
if (ret) {
mlog_errno(ret);
goto out_unlock;
}
mlog(0, "Add a new group %llu in chain = %u, length = %u\n",
(unsigned long long)input->group, input->chain, input->clusters);
handle = ocfs2_start_trans(osb, OCFS2_GROUP_ADD_CREDITS);
if (IS_ERR(handle)) {
mlog_errno(PTR_ERR(handle));
ret = -EINVAL;
goto out_unlock;
}
cl_bpc = le16_to_cpu(fe->id2.i_chain.cl_bpc);
cl = &fe->id2.i_chain;
cr = &cl->cl_recs[input->chain];
ret = ocfs2_journal_access(handle, main_bm_inode, group_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret < 0) {
mlog_errno(ret);
goto out_commit;
}
group = (struct ocfs2_group_desc *)group_bh->b_data;
group->bg_next_group = cr->c_blkno;
ret = ocfs2_journal_dirty(handle, group_bh);
if (ret < 0) {
mlog_errno(ret);
goto out_commit;
}
ret = ocfs2_journal_access(handle, main_bm_inode, main_bm_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret < 0) {
mlog_errno(ret);
goto out_commit;
}
if (input->chain == le16_to_cpu(cl->cl_next_free_rec)) {
le16_add_cpu(&cl->cl_next_free_rec, 1);
memset(cr, 0, sizeof(struct ocfs2_chain_rec));
}
cr->c_blkno = le64_to_cpu(input->group);
le32_add_cpu(&cr->c_total, input->clusters * cl_bpc);
le32_add_cpu(&cr->c_free, input->frees * cl_bpc);
le32_add_cpu(&fe->id1.bitmap1.i_total, input->clusters *cl_bpc);
le32_add_cpu(&fe->id1.bitmap1.i_used,
(input->clusters - input->frees) * cl_bpc);
le32_add_cpu(&fe->i_clusters, input->clusters);
ocfs2_journal_dirty(handle, main_bm_bh);
spin_lock(&OCFS2_I(main_bm_inode)->ip_lock);
OCFS2_I(main_bm_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
le64_add_cpu(&fe->i_size, input->clusters << osb->s_clustersize_bits);
spin_unlock(&OCFS2_I(main_bm_inode)->ip_lock);
i_size_write(main_bm_inode, le64_to_cpu(fe->i_size));
ocfs2_update_super_and_backups(main_bm_inode, input->clusters);
out_commit:
ocfs2_commit_trans(osb, handle);
out_unlock:
brelse(group_bh);
brelse(main_bm_bh);
ocfs2_inode_unlock(main_bm_inode, 1);
out_mutex:
mutex_unlock(&main_bm_inode->i_mutex);
iput(main_bm_inode);
out:
mlog_exit_void();
return ret;
}

32
fs/ocfs2/resize.h Normal file
View file

@ -0,0 +1,32 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* resize.h
*
* Function prototypes
*
* Copyright (C) 2007 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#ifndef OCFS2_RESIZE_H
#define OCFS2_RESIZE_H
int ocfs2_group_extend(struct inode * inode, int new_clusters);
int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input);
#endif /* OCFS2_RESIZE_H */

View file

@ -48,25 +48,6 @@ static void __ocfs2_fill_slot(struct ocfs2_slot_info *si,
s16 slot_num,
s16 node_num);
/* Use the slot information we've collected to create a map of mounted
* nodes. Should be holding an EX on super block. assumes slot info is
* up to date. Note that we call this *after* we find a slot, so our
* own node should be set in the map too... */
void ocfs2_populate_mounted_map(struct ocfs2_super *osb)
{
int i;
struct ocfs2_slot_info *si = osb->slot_info;
spin_lock(&si->si_lock);
for (i = 0; i < si->si_size; i++)
if (si->si_global_node_nums[i] != OCFS2_INVALID_SLOT)
ocfs2_node_map_set_bit(osb, &osb->mounted_map,
si->si_global_node_nums[i]);
spin_unlock(&si->si_lock);
}
/* post the slot information on disk into our slot_info struct. */
void ocfs2_update_slot_info(struct ocfs2_slot_info *si)
{

View file

@ -52,8 +52,6 @@ s16 ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
void ocfs2_clear_slot(struct ocfs2_slot_info *si,
s16 slot_num);
void ocfs2_populate_mounted_map(struct ocfs2_super *osb);
static inline int ocfs2_is_empty_slot(struct ocfs2_slot_info *si,
int slot_num)
{

View file

@ -101,8 +101,6 @@ static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg
static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
u64 bg_blkno,
u16 bg_bit_off);
static inline u64 ocfs2_which_cluster_group(struct inode *inode,
u32 cluster);
static inline void ocfs2_block_to_cluster_group(struct inode *inode,
u64 data_blkno,
u64 *bg_blkno,
@ -114,7 +112,7 @@ void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
if (inode) {
if (ac->ac_which != OCFS2_AC_USE_LOCAL)
ocfs2_meta_unlock(inode, 1);
ocfs2_inode_unlock(inode, 1);
mutex_unlock(&inode->i_mutex);
@ -131,9 +129,9 @@ static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
}
/* somewhat more expensive than our other checks, so use sparingly. */
static int ocfs2_check_group_descriptor(struct super_block *sb,
struct ocfs2_dinode *di,
struct ocfs2_group_desc *gd)
int ocfs2_check_group_descriptor(struct super_block *sb,
struct ocfs2_dinode *di,
struct ocfs2_group_desc *gd)
{
unsigned int max_bits;
@ -412,7 +410,7 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
mutex_lock(&alloc_inode->i_mutex);
status = ocfs2_meta_lock(alloc_inode, &bh, 1);
status = ocfs2_inode_lock(alloc_inode, &bh, 1);
if (status < 0) {
mutex_unlock(&alloc_inode->i_mutex);
iput(alloc_inode);
@ -1443,8 +1441,7 @@ static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
/* given a cluster offset, calculate which block group it belongs to
* and return that block offset. */
static inline u64 ocfs2_which_cluster_group(struct inode *inode,
u32 cluster)
u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster)
{
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
u32 group_no;
@ -1519,8 +1516,9 @@ int __ocfs2_claim_clusters(struct ocfs2_super *osb,
if (min_clusters > (osb->bitmap_cpg - 1)) {
/* The only paths asking for contiguousness
* should know about this already. */
mlog(ML_ERROR, "minimum allocation requested exceeds "
"group bitmap size!");
mlog(ML_ERROR, "minimum allocation requested %u exceeds "
"group bitmap size %u!\n", min_clusters,
osb->bitmap_cpg);
status = -ENOSPC;
goto bail;
}

View file

@ -147,4 +147,12 @@ static inline int ocfs2_is_cluster_bitmap(struct inode *inode)
int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
struct ocfs2_alloc_context *ac);
/* given a cluster offset, calculate which block group it belongs to
* and return that block offset. */
u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster);
/* somewhat more expensive than our other checks, so use sparingly. */
int ocfs2_check_group_descriptor(struct super_block *sb,
struct ocfs2_dinode *di,
struct ocfs2_group_desc *gd);
#endif /* _CHAINALLOC_H_ */

View file

@ -65,7 +65,6 @@
#include "sysfile.h"
#include "uptodate.h"
#include "ver.h"
#include "vote.h"
#include "buffer_head_io.h"
@ -84,9 +83,11 @@ MODULE_LICENSE("GPL");
struct mount_options
{
unsigned long commit_interval;
unsigned long mount_opt;
unsigned int atime_quantum;
signed short slot;
unsigned int localalloc_opt;
};
static int ocfs2_parse_options(struct super_block *sb, char *options,
@ -150,6 +151,9 @@ enum {
Opt_data_writeback,
Opt_atime_quantum,
Opt_slot,
Opt_commit,
Opt_localalloc,
Opt_localflocks,
Opt_err,
};
@ -165,6 +169,9 @@ static match_table_t tokens = {
{Opt_data_writeback, "data=writeback"},
{Opt_atime_quantum, "atime_quantum=%u"},
{Opt_slot, "preferred_slot=%u"},
{Opt_commit, "commit=%u"},
{Opt_localalloc, "localalloc=%d"},
{Opt_localflocks, "localflocks"},
{Opt_err, NULL}
};
@ -213,7 +220,7 @@ static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb)
mlog_entry_void();
new = ocfs2_iget(osb, osb->root_blkno, OCFS2_FI_FLAG_SYSFILE);
new = ocfs2_iget(osb, osb->root_blkno, OCFS2_FI_FLAG_SYSFILE, 0);
if (IS_ERR(new)) {
status = PTR_ERR(new);
mlog_errno(status);
@ -221,7 +228,7 @@ static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb)
}
osb->root_inode = new;
new = ocfs2_iget(osb, osb->system_dir_blkno, OCFS2_FI_FLAG_SYSFILE);
new = ocfs2_iget(osb, osb->system_dir_blkno, OCFS2_FI_FLAG_SYSFILE, 0);
if (IS_ERR(new)) {
status = PTR_ERR(new);
mlog_errno(status);
@ -443,6 +450,8 @@ unlock_osb:
osb->s_mount_opt = parsed_options.mount_opt;
osb->s_atime_quantum = parsed_options.atime_quantum;
osb->preferred_slot = parsed_options.slot;
if (parsed_options.commit_interval)
osb->osb_commit_interval = parsed_options.commit_interval;
if (!ocfs2_is_hard_readonly(osb))
ocfs2_set_journal_params(osb);
@ -597,6 +606,8 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
osb->s_mount_opt = parsed_options.mount_opt;
osb->s_atime_quantum = parsed_options.atime_quantum;
osb->preferred_slot = parsed_options.slot;
osb->osb_commit_interval = parsed_options.commit_interval;
osb->local_alloc_size = parsed_options.localalloc_opt;
sb->s_magic = OCFS2_SUPER_MAGIC;
@ -747,9 +758,11 @@ static int ocfs2_parse_options(struct super_block *sb,
mlog_entry("remount: %d, options: \"%s\"\n", is_remount,
options ? options : "(none)");
mopt->commit_interval = 0;
mopt->mount_opt = 0;
mopt->atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM;
mopt->slot = OCFS2_INVALID_SLOT;
mopt->localalloc_opt = OCFS2_DEFAULT_LOCAL_ALLOC_SIZE;
if (!options) {
status = 1;
@ -816,6 +829,41 @@ static int ocfs2_parse_options(struct super_block *sb,
if (option)
mopt->slot = (s16)option;
break;
case Opt_commit:
option = 0;
if (match_int(&args[0], &option)) {
status = 0;
goto bail;
}
if (option < 0)
return 0;
if (option == 0)
option = JBD_DEFAULT_MAX_COMMIT_AGE;
mopt->commit_interval = HZ * option;
break;
case Opt_localalloc:
option = 0;
if (match_int(&args[0], &option)) {
status = 0;
goto bail;
}
if (option >= 0 && (option <= ocfs2_local_alloc_size(sb) * 8))
mopt->localalloc_opt = option;
break;
case Opt_localflocks:
/*
* Changing this during remount could race
* flock() requests, or "unbalance" existing
* ones (e.g., a lock is taken in one mode but
* dropped in the other). If users care enough
* to flip locking modes during remount, we
* could add a "local" flag to individual
* flock structures for proper tracking of
* state.
*/
if (!is_remount)
mopt->mount_opt |= OCFS2_MOUNT_LOCALFLOCKS;
break;
default:
mlog(ML_ERROR,
"Unrecognized mount option \"%s\" "
@ -864,6 +912,16 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
if (osb->s_atime_quantum != OCFS2_DEFAULT_ATIME_QUANTUM)
seq_printf(s, ",atime_quantum=%u", osb->s_atime_quantum);
if (osb->osb_commit_interval)
seq_printf(s, ",commit=%u",
(unsigned) (osb->osb_commit_interval / HZ));
if (osb->local_alloc_size != OCFS2_DEFAULT_LOCAL_ALLOC_SIZE)
seq_printf(s, ",localalloc=%d", osb->local_alloc_size);
if (opts & OCFS2_MOUNT_LOCALFLOCKS)
seq_printf(s, ",localflocks,");
return 0;
}
@ -965,7 +1023,7 @@ static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
goto bail;
}
status = ocfs2_meta_lock(inode, &bh, 0);
status = ocfs2_inode_lock(inode, &bh, 0);
if (status < 0) {
mlog_errno(status);
goto bail;
@ -989,7 +1047,7 @@ static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
brelse(bh);
ocfs2_meta_unlock(inode, 0);
ocfs2_inode_unlock(inode, 0);
status = 0;
bail:
if (inode)
@ -1020,8 +1078,7 @@ static void ocfs2_inode_init_once(struct kmem_cache *cachep, void *data)
oi->ip_clusters = 0;
ocfs2_lock_res_init_once(&oi->ip_rw_lockres);
ocfs2_lock_res_init_once(&oi->ip_meta_lockres);
ocfs2_lock_res_init_once(&oi->ip_data_lockres);
ocfs2_lock_res_init_once(&oi->ip_inode_lockres);
ocfs2_lock_res_init_once(&oi->ip_open_lockres);
ocfs2_metadata_cache_init(&oi->vfs_inode);
@ -1117,25 +1174,12 @@ static int ocfs2_mount_volume(struct super_block *sb)
goto leave;
}
status = ocfs2_register_hb_callbacks(osb);
if (status < 0) {
mlog_errno(status);
goto leave;
}
status = ocfs2_dlm_init(osb);
if (status < 0) {
mlog_errno(status);
goto leave;
}
/* requires vote_thread to be running. */
status = ocfs2_register_net_handlers(osb);
if (status < 0) {
mlog_errno(status);
goto leave;
}
status = ocfs2_super_lock(osb, 1);
if (status < 0) {
mlog_errno(status);
@ -1150,8 +1194,6 @@ static int ocfs2_mount_volume(struct super_block *sb)
goto leave;
}
ocfs2_populate_mounted_map(osb);
/* load all node-local system inodes */
status = ocfs2_init_local_system_inodes(osb);
if (status < 0) {
@ -1174,15 +1216,6 @@ static int ocfs2_mount_volume(struct super_block *sb)
if (ocfs2_mount_local(osb))
goto leave;
/* This should be sent *after* we recovered our journal as it
* will cause other nodes to unmark us as needing
* recovery. However, we need to send it *before* dropping the
* super block lock as otherwise their recovery threads might
* try to clean us up while we're live! */
status = ocfs2_request_mount_vote(osb);
if (status < 0)
mlog_errno(status);
leave:
if (unlock_super)
ocfs2_super_unlock(osb, 1);
@ -1240,10 +1273,6 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
mlog_errno(tmp);
return;
}
tmp = ocfs2_request_umount_vote(osb);
if (tmp < 0)
mlog_errno(tmp);
}
if (osb->slot_num != OCFS2_INVALID_SLOT)
@ -1254,13 +1283,8 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
ocfs2_release_system_inodes(osb);
if (osb->dlm) {
ocfs2_unregister_net_handlers(osb);
if (osb->dlm)
ocfs2_dlm_shutdown(osb);
}
ocfs2_clear_hb_callbacks(osb);
debugfs_remove(osb->osb_debug_root);
@ -1315,7 +1339,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
int i, cbits, bbits;
struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
struct inode *inode = NULL;
struct buffer_head *bitmap_bh = NULL;
struct ocfs2_journal *journal;
__le32 uuid_net_key;
struct ocfs2_super *osb;
@ -1344,19 +1367,13 @@ static int ocfs2_initialize_super(struct super_block *sb,
osb->s_sectsize_bits = blksize_bits(sector_size);
BUG_ON(!osb->s_sectsize_bits);
osb->net_response_ids = 0;
spin_lock_init(&osb->net_response_lock);
INIT_LIST_HEAD(&osb->net_response_list);
INIT_LIST_HEAD(&osb->osb_net_handlers);
init_waitqueue_head(&osb->recovery_event);
spin_lock_init(&osb->vote_task_lock);
init_waitqueue_head(&osb->vote_event);
osb->vote_work_sequence = 0;
osb->vote_wake_sequence = 0;
spin_lock_init(&osb->dc_task_lock);
init_waitqueue_head(&osb->dc_event);
osb->dc_work_sequence = 0;
osb->dc_wake_sequence = 0;
INIT_LIST_HEAD(&osb->blocked_lock_list);
osb->blocked_lock_count = 0;
INIT_LIST_HEAD(&osb->vote_list);
spin_lock_init(&osb->osb_lock);
atomic_set(&osb->alloc_stats.moves, 0);
@ -1496,7 +1513,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
}
memcpy(&uuid_net_key, di->id2.i_super.s_uuid, sizeof(uuid_net_key));
osb->net_key = le32_to_cpu(uuid_net_key);
strncpy(osb->vol_label, di->id2.i_super.s_label, 63);
osb->vol_label[63] = '\0';
@ -1539,25 +1555,9 @@ static int ocfs2_initialize_super(struct super_block *sb,
}
osb->bitmap_blkno = OCFS2_I(inode)->ip_blkno;
/* We don't have a cluster lock on the bitmap here because
* we're only interested in static information and the extra
* complexity at mount time isn't worht it. Don't pass the
* inode in to the read function though as we don't want it to
* be put in the cache. */
status = ocfs2_read_block(osb, osb->bitmap_blkno, &bitmap_bh, 0,
NULL);
iput(inode);
if (status < 0) {
mlog_errno(status);
goto bail;
}
di = (struct ocfs2_dinode *) bitmap_bh->b_data;
osb->bitmap_cpg = le16_to_cpu(di->id2.i_chain.cl_cpg);
brelse(bitmap_bh);
mlog(0, "cluster bitmap inode: %llu, clusters per group: %u\n",
(unsigned long long)osb->bitmap_blkno, osb->bitmap_cpg);
osb->bitmap_cpg = ocfs2_group_bitmap_size(sb) * 8;
status = ocfs2_init_slot_info(osb);
if (status < 0) {

View file

@ -112,7 +112,7 @@ static struct inode * _ocfs2_get_system_file_inode(struct ocfs2_super *osb,
goto bail;
}
inode = ocfs2_iget(osb, blkno, OCFS2_FI_FLAG_SYSFILE);
inode = ocfs2_iget(osb, blkno, OCFS2_FI_FLAG_SYSFILE, type);
if (IS_ERR(inode)) {
mlog_errno(PTR_ERR(inode));
inode = NULL;

View file

@ -29,7 +29,7 @@
#include "ver.h"
#define OCFS2_BUILD_VERSION "1.3.3"
#define OCFS2_BUILD_VERSION "1.5.0"
#define VERSION_STR "OCFS2 " OCFS2_BUILD_VERSION

View file

@ -1,756 +0,0 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* vote.c
*
* description here
*
* Copyright (C) 2003, 2004 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/kthread.h>
#include <cluster/heartbeat.h>
#include <cluster/nodemanager.h>
#include <cluster/tcp.h>
#include <dlm/dlmapi.h>
#define MLOG_MASK_PREFIX ML_VOTE
#include <cluster/masklog.h>
#include "ocfs2.h"
#include "alloc.h"
#include "dlmglue.h"
#include "extent_map.h"
#include "heartbeat.h"
#include "inode.h"
#include "journal.h"
#include "slot_map.h"
#include "vote.h"
#include "buffer_head_io.h"
#define OCFS2_MESSAGE_TYPE_VOTE (0x1)
#define OCFS2_MESSAGE_TYPE_RESPONSE (0x2)
struct ocfs2_msg_hdr
{
__be32 h_response_id; /* used to lookup message handle on sending
* node. */
__be32 h_request;
__be64 h_blkno;
__be32 h_generation;
__be32 h_node_num; /* node sending this particular message. */
};
struct ocfs2_vote_msg
{
struct ocfs2_msg_hdr v_hdr;
__be32 v_reserved1;
} __attribute__ ((packed));
/* Responses are given these values to maintain backwards
* compatibility with older ocfs2 versions */
#define OCFS2_RESPONSE_OK (0)
#define OCFS2_RESPONSE_BUSY (-16)
#define OCFS2_RESPONSE_BAD_MSG (-22)
struct ocfs2_response_msg
{
struct ocfs2_msg_hdr r_hdr;
__be32 r_response;
} __attribute__ ((packed));
struct ocfs2_vote_work {
struct list_head w_list;
struct ocfs2_vote_msg w_msg;
};
enum ocfs2_vote_request {
OCFS2_VOTE_REQ_INVALID = 0,
OCFS2_VOTE_REQ_MOUNT,
OCFS2_VOTE_REQ_UMOUNT,
OCFS2_VOTE_REQ_LAST
};
static inline int ocfs2_is_valid_vote_request(int request)
{
return OCFS2_VOTE_REQ_INVALID < request &&
request < OCFS2_VOTE_REQ_LAST;
}
typedef void (*ocfs2_net_response_callback)(void *priv,
struct ocfs2_response_msg *resp);
struct ocfs2_net_response_cb {
ocfs2_net_response_callback rc_cb;
void *rc_priv;
};
struct ocfs2_net_wait_ctxt {
struct list_head n_list;
u32 n_response_id;
wait_queue_head_t n_event;
struct ocfs2_node_map n_node_map;
int n_response; /* an agreggate response. 0 if
* all nodes are go, < 0 on any
* negative response from any
* node or network error. */
struct ocfs2_net_response_cb *n_callback;
};
static void ocfs2_process_mount_request(struct ocfs2_super *osb,
unsigned int node_num)
{
mlog(0, "MOUNT vote from node %u\n", node_num);
/* The other node only sends us this message when he has an EX
* on the superblock, so our recovery threads (if having been
* launched) are waiting on it.*/
ocfs2_recovery_map_clear(osb, node_num);
ocfs2_node_map_set_bit(osb, &osb->mounted_map, node_num);
/* We clear the umount map here because a node may have been
* previously mounted, safely unmounted but never stopped
* heartbeating - in which case we'd have a stale entry. */
ocfs2_node_map_clear_bit(osb, &osb->umount_map, node_num);
}
static void ocfs2_process_umount_request(struct ocfs2_super *osb,
unsigned int node_num)
{
mlog(0, "UMOUNT vote from node %u\n", node_num);
ocfs2_node_map_clear_bit(osb, &osb->mounted_map, node_num);
ocfs2_node_map_set_bit(osb, &osb->umount_map, node_num);
}
static void ocfs2_process_vote(struct ocfs2_super *osb,
struct ocfs2_vote_msg *msg)
{
int net_status, vote_response;
unsigned int node_num;
u64 blkno;
enum ocfs2_vote_request request;
struct ocfs2_msg_hdr *hdr = &msg->v_hdr;
struct ocfs2_response_msg response;
/* decode the network mumbo jumbo into local variables. */
request = be32_to_cpu(hdr->h_request);
blkno = be64_to_cpu(hdr->h_blkno);
node_num = be32_to_cpu(hdr->h_node_num);
mlog(0, "processing vote: request = %u, blkno = %llu, node_num = %u\n",
request, (unsigned long long)blkno, node_num);
if (!ocfs2_is_valid_vote_request(request)) {
mlog(ML_ERROR, "Invalid vote request %d from node %u\n",
request, node_num);
vote_response = OCFS2_RESPONSE_BAD_MSG;
goto respond;
}
vote_response = OCFS2_RESPONSE_OK;
switch (request) {
case OCFS2_VOTE_REQ_UMOUNT:
ocfs2_process_umount_request(osb, node_num);
goto respond;
case OCFS2_VOTE_REQ_MOUNT:
ocfs2_process_mount_request(osb, node_num);
goto respond;
default:
/* avoids a gcc warning */
break;
}
respond:
/* Response struture is small so we just put it on the stack
* and stuff it inline. */
memset(&response, 0, sizeof(struct ocfs2_response_msg));
response.r_hdr.h_response_id = hdr->h_response_id;
response.r_hdr.h_blkno = hdr->h_blkno;
response.r_hdr.h_generation = hdr->h_generation;
response.r_hdr.h_node_num = cpu_to_be32(osb->node_num);
response.r_response = cpu_to_be32(vote_response);
net_status = o2net_send_message(OCFS2_MESSAGE_TYPE_RESPONSE,
osb->net_key,
&response,
sizeof(struct ocfs2_response_msg),
node_num,
NULL);
/* We still want to error print for ENOPROTOOPT here. The
* sending node shouldn't have unregistered his net handler
* without sending an unmount vote 1st */
if (net_status < 0
&& net_status != -ETIMEDOUT
&& net_status != -ENOTCONN)
mlog(ML_ERROR, "message to node %u fails with error %d!\n",
node_num, net_status);
}
static void ocfs2_vote_thread_do_work(struct ocfs2_super *osb)
{
unsigned long processed;
struct ocfs2_lock_res *lockres;
struct ocfs2_vote_work *work;
mlog_entry_void();
spin_lock(&osb->vote_task_lock);
/* grab this early so we know to try again if a state change and
* wake happens part-way through our work */
osb->vote_work_sequence = osb->vote_wake_sequence;
processed = osb->blocked_lock_count;
while (processed) {
BUG_ON(list_empty(&osb->blocked_lock_list));
lockres = list_entry(osb->blocked_lock_list.next,
struct ocfs2_lock_res, l_blocked_list);
list_del_init(&lockres->l_blocked_list);
osb->blocked_lock_count--;
spin_unlock(&osb->vote_task_lock);
BUG_ON(!processed);
processed--;
ocfs2_process_blocked_lock(osb, lockres);
spin_lock(&osb->vote_task_lock);
}
while (osb->vote_count) {
BUG_ON(list_empty(&osb->vote_list));
work = list_entry(osb->vote_list.next,
struct ocfs2_vote_work, w_list);
list_del(&work->w_list);
osb->vote_count--;
spin_unlock(&osb->vote_task_lock);
ocfs2_process_vote(osb, &work->w_msg);
kfree(work);
spin_lock(&osb->vote_task_lock);
}
spin_unlock(&osb->vote_task_lock);
mlog_exit_void();
}
static int ocfs2_vote_thread_lists_empty(struct ocfs2_super *osb)
{
int empty = 0;
spin_lock(&osb->vote_task_lock);
if (list_empty(&osb->blocked_lock_list) &&
list_empty(&osb->vote_list))
empty = 1;
spin_unlock(&osb->vote_task_lock);
return empty;
}
static int ocfs2_vote_thread_should_wake(struct ocfs2_super *osb)
{
int should_wake = 0;
spin_lock(&osb->vote_task_lock);
if (osb->vote_work_sequence != osb->vote_wake_sequence)
should_wake = 1;
spin_unlock(&osb->vote_task_lock);
return should_wake;
}
int ocfs2_vote_thread(void *arg)
{
int status = 0;
struct ocfs2_super *osb = arg;
/* only quit once we've been asked to stop and there is no more
* work available */
while (!(kthread_should_stop() &&
ocfs2_vote_thread_lists_empty(osb))) {
wait_event_interruptible(osb->vote_event,
ocfs2_vote_thread_should_wake(osb) ||
kthread_should_stop());
mlog(0, "vote_thread: awoken\n");
ocfs2_vote_thread_do_work(osb);
}
osb->vote_task = NULL;
return status;
}
static struct ocfs2_net_wait_ctxt *ocfs2_new_net_wait_ctxt(unsigned int response_id)
{
struct ocfs2_net_wait_ctxt *w;
w = kzalloc(sizeof(*w), GFP_NOFS);
if (!w) {
mlog_errno(-ENOMEM);
goto bail;
}
INIT_LIST_HEAD(&w->n_list);
init_waitqueue_head(&w->n_event);
ocfs2_node_map_init(&w->n_node_map);
w->n_response_id = response_id;
w->n_callback = NULL;
bail:
return w;
}
static unsigned int ocfs2_new_response_id(struct ocfs2_super *osb)
{
unsigned int ret;
spin_lock(&osb->net_response_lock);
ret = ++osb->net_response_ids;
spin_unlock(&osb->net_response_lock);
return ret;
}
static void ocfs2_dequeue_net_wait_ctxt(struct ocfs2_super *osb,
struct ocfs2_net_wait_ctxt *w)
{
spin_lock(&osb->net_response_lock);
list_del(&w->n_list);
spin_unlock(&osb->net_response_lock);
}
static void ocfs2_queue_net_wait_ctxt(struct ocfs2_super *osb,
struct ocfs2_net_wait_ctxt *w)
{
spin_lock(&osb->net_response_lock);
list_add_tail(&w->n_list,
&osb->net_response_list);
spin_unlock(&osb->net_response_lock);
}
static void __ocfs2_mark_node_responded(struct ocfs2_super *osb,
struct ocfs2_net_wait_ctxt *w,
int node_num)
{
assert_spin_locked(&osb->net_response_lock);
ocfs2_node_map_clear_bit(osb, &w->n_node_map, node_num);
if (ocfs2_node_map_is_empty(osb, &w->n_node_map))
wake_up(&w->n_event);
}
/* Intended to be called from the node down callback, we fake remove
* the node from all our response contexts */
void ocfs2_remove_node_from_vote_queues(struct ocfs2_super *osb,
int node_num)
{
struct list_head *p;
struct ocfs2_net_wait_ctxt *w = NULL;
spin_lock(&osb->net_response_lock);
list_for_each(p, &osb->net_response_list) {
w = list_entry(p, struct ocfs2_net_wait_ctxt, n_list);
__ocfs2_mark_node_responded(osb, w, node_num);
}
spin_unlock(&osb->net_response_lock);
}
static int ocfs2_broadcast_vote(struct ocfs2_super *osb,
struct ocfs2_vote_msg *request,
unsigned int response_id,
int *response,
struct ocfs2_net_response_cb *callback)
{
int status, i, remote_err;
struct ocfs2_net_wait_ctxt *w = NULL;
int dequeued = 0;
mlog_entry_void();
w = ocfs2_new_net_wait_ctxt(response_id);
if (!w) {
status = -ENOMEM;
mlog_errno(status);
goto bail;
}
w->n_callback = callback;
/* we're pretty much ready to go at this point, and this fills
* in n_response which we need anyway... */
ocfs2_queue_net_wait_ctxt(osb, w);
i = ocfs2_node_map_iterate(osb, &osb->mounted_map, 0);
while (i != O2NM_INVALID_NODE_NUM) {
if (i != osb->node_num) {
mlog(0, "trying to send request to node %i\n", i);
ocfs2_node_map_set_bit(osb, &w->n_node_map, i);
remote_err = 0;
status = o2net_send_message(OCFS2_MESSAGE_TYPE_VOTE,
osb->net_key,
request,
sizeof(*request),
i,
&remote_err);
if (status == -ETIMEDOUT) {
mlog(0, "remote node %d timed out!\n", i);
status = -EAGAIN;
goto bail;
}
if (remote_err < 0) {
status = remote_err;
mlog(0, "remote error %d on node %d!\n",
remote_err, i);
mlog_errno(status);
goto bail;
}
if (status < 0) {
mlog_errno(status);
goto bail;
}
}
i++;
i = ocfs2_node_map_iterate(osb, &osb->mounted_map, i);
mlog(0, "next is %d, i am %d\n", i, osb->node_num);
}
mlog(0, "done sending, now waiting on responses...\n");
wait_event(w->n_event, ocfs2_node_map_is_empty(osb, &w->n_node_map));
ocfs2_dequeue_net_wait_ctxt(osb, w);
dequeued = 1;
*response = w->n_response;
status = 0;
bail:
if (w) {
if (!dequeued)
ocfs2_dequeue_net_wait_ctxt(osb, w);
kfree(w);
}
mlog_exit(status);
return status;
}
static struct ocfs2_vote_msg * ocfs2_new_vote_request(struct ocfs2_super *osb,
u64 blkno,
unsigned int generation,
enum ocfs2_vote_request type)
{
struct ocfs2_vote_msg *request;
struct ocfs2_msg_hdr *hdr;
BUG_ON(!ocfs2_is_valid_vote_request(type));
request = kzalloc(sizeof(*request), GFP_NOFS);
if (!request) {
mlog_errno(-ENOMEM);
} else {
hdr = &request->v_hdr;
hdr->h_node_num = cpu_to_be32(osb->node_num);
hdr->h_request = cpu_to_be32(type);
hdr->h_blkno = cpu_to_be64(blkno);
hdr->h_generation = cpu_to_be32(generation);
}
return request;
}
/* Complete the buildup of a new vote request and process the
* broadcast return value. */
static int ocfs2_do_request_vote(struct ocfs2_super *osb,
struct ocfs2_vote_msg *request,
struct ocfs2_net_response_cb *callback)
{
int status, response = -EBUSY;
unsigned int response_id;
struct ocfs2_msg_hdr *hdr;
response_id = ocfs2_new_response_id(osb);
hdr = &request->v_hdr;
hdr->h_response_id = cpu_to_be32(response_id);
status = ocfs2_broadcast_vote(osb, request, response_id, &response,
callback);
if (status < 0) {
mlog_errno(status);
goto bail;
}
status = response;
bail:
return status;
}
int ocfs2_request_mount_vote(struct ocfs2_super *osb)
{
int status;
struct ocfs2_vote_msg *request = NULL;
request = ocfs2_new_vote_request(osb, 0ULL, 0, OCFS2_VOTE_REQ_MOUNT);
if (!request) {
status = -ENOMEM;
goto bail;
}
status = -EAGAIN;
while (status == -EAGAIN) {
if (!(osb->s_mount_opt & OCFS2_MOUNT_NOINTR) &&
signal_pending(current)) {
status = -ERESTARTSYS;
goto bail;
}
if (ocfs2_node_map_is_only(osb, &osb->mounted_map,
osb->node_num)) {
status = 0;
goto bail;
}
status = ocfs2_do_request_vote(osb, request, NULL);
}
bail:
kfree(request);
return status;
}
int ocfs2_request_umount_vote(struct ocfs2_super *osb)
{
int status;
struct ocfs2_vote_msg *request = NULL;
request = ocfs2_new_vote_request(osb, 0ULL, 0, OCFS2_VOTE_REQ_UMOUNT);
if (!request) {
status = -ENOMEM;
goto bail;
}
status = -EAGAIN;
while (status == -EAGAIN) {
/* Do not check signals on this vote... We really want
* this one to go all the way through. */
if (ocfs2_node_map_is_only(osb, &osb->mounted_map,
osb->node_num)) {
status = 0;
goto bail;
}
status = ocfs2_do_request_vote(osb, request, NULL);
}
bail:
kfree(request);
return status;
}
/* TODO: This should eventually be a hash table! */
static struct ocfs2_net_wait_ctxt * __ocfs2_find_net_wait_ctxt(struct ocfs2_super *osb,
u32 response_id)
{
struct list_head *p;
struct ocfs2_net_wait_ctxt *w = NULL;
list_for_each(p, &osb->net_response_list) {
w = list_entry(p, struct ocfs2_net_wait_ctxt, n_list);
if (response_id == w->n_response_id)
break;
w = NULL;
}
return w;
}
/* Translate response codes into local node errno values */
static inline int ocfs2_translate_response(int response)
{
int ret;
switch (response) {
case OCFS2_RESPONSE_OK:
ret = 0;
break;
case OCFS2_RESPONSE_BUSY:
ret = -EBUSY;
break;
default:
ret = -EINVAL;
}
return ret;
}
static int ocfs2_handle_response_message(struct o2net_msg *msg,
u32 len,
void *data, void **ret_data)
{
unsigned int response_id, node_num;
int response_status;
struct ocfs2_super *osb = data;
struct ocfs2_response_msg *resp;
struct ocfs2_net_wait_ctxt * w;
struct ocfs2_net_response_cb *resp_cb;
resp = (struct ocfs2_response_msg *) msg->buf;
response_id = be32_to_cpu(resp->r_hdr.h_response_id);
node_num = be32_to_cpu(resp->r_hdr.h_node_num);
response_status =
ocfs2_translate_response(be32_to_cpu(resp->r_response));
mlog(0, "received response message:\n");
mlog(0, "h_response_id = %u\n", response_id);
mlog(0, "h_request = %u\n", be32_to_cpu(resp->r_hdr.h_request));
mlog(0, "h_blkno = %llu\n",
(unsigned long long)be64_to_cpu(resp->r_hdr.h_blkno));
mlog(0, "h_generation = %u\n", be32_to_cpu(resp->r_hdr.h_generation));
mlog(0, "h_node_num = %u\n", node_num);
mlog(0, "r_response = %d\n", response_status);
spin_lock(&osb->net_response_lock);
w = __ocfs2_find_net_wait_ctxt(osb, response_id);
if (!w) {
mlog(0, "request not found!\n");
goto bail;
}
resp_cb = w->n_callback;
if (response_status && (!w->n_response)) {
/* we only really need one negative response so don't
* set it twice. */
w->n_response = response_status;
}
if (resp_cb) {
spin_unlock(&osb->net_response_lock);
resp_cb->rc_cb(resp_cb->rc_priv, resp);
spin_lock(&osb->net_response_lock);
}
__ocfs2_mark_node_responded(osb, w, node_num);
bail:
spin_unlock(&osb->net_response_lock);
return 0;
}
static int ocfs2_handle_vote_message(struct o2net_msg *msg,
u32 len,
void *data, void **ret_data)
{
int status;
struct ocfs2_super *osb = data;
struct ocfs2_vote_work *work;
work = kmalloc(sizeof(struct ocfs2_vote_work), GFP_NOFS);
if (!work) {
status = -ENOMEM;
mlog_errno(status);
goto bail;
}
INIT_LIST_HEAD(&work->w_list);
memcpy(&work->w_msg, msg->buf, sizeof(struct ocfs2_vote_msg));
mlog(0, "scheduling vote request:\n");
mlog(0, "h_response_id = %u\n",
be32_to_cpu(work->w_msg.v_hdr.h_response_id));
mlog(0, "h_request = %u\n", be32_to_cpu(work->w_msg.v_hdr.h_request));
mlog(0, "h_blkno = %llu\n",
(unsigned long long)be64_to_cpu(work->w_msg.v_hdr.h_blkno));
mlog(0, "h_generation = %u\n",
be32_to_cpu(work->w_msg.v_hdr.h_generation));
mlog(0, "h_node_num = %u\n",
be32_to_cpu(work->w_msg.v_hdr.h_node_num));
spin_lock(&osb->vote_task_lock);
list_add_tail(&work->w_list, &osb->vote_list);
osb->vote_count++;
spin_unlock(&osb->vote_task_lock);
ocfs2_kick_vote_thread(osb);
status = 0;
bail:
return status;
}
void ocfs2_unregister_net_handlers(struct ocfs2_super *osb)
{
if (!osb->net_key)
return;
o2net_unregister_handler_list(&osb->osb_net_handlers);
if (!list_empty(&osb->net_response_list))
mlog(ML_ERROR, "net response list not empty!\n");
osb->net_key = 0;
}
int ocfs2_register_net_handlers(struct ocfs2_super *osb)
{
int status = 0;
if (ocfs2_mount_local(osb))
return 0;
status = o2net_register_handler(OCFS2_MESSAGE_TYPE_RESPONSE,
osb->net_key,
sizeof(struct ocfs2_response_msg),
ocfs2_handle_response_message,
osb, NULL, &osb->osb_net_handlers);
if (status) {
mlog_errno(status);
goto bail;
}
status = o2net_register_handler(OCFS2_MESSAGE_TYPE_VOTE,
osb->net_key,
sizeof(struct ocfs2_vote_msg),
ocfs2_handle_vote_message,
osb, NULL, &osb->osb_net_handlers);
if (status) {
mlog_errno(status);
goto bail;
}
bail:
if (status < 0)
ocfs2_unregister_net_handlers(osb);
return status;
}

View file

@ -49,6 +49,7 @@ header-y += comstats.h
header-y += const.h
header-y += cgroupstats.h
header-y += cycx_cfm.h
header-y += dlmconstants.h
header-y += dlm_device.h
header-y += dlm_netlink.h
header-y += dm-ioctl.h

View file

@ -19,148 +19,12 @@
* routines and structures to use DLM lockspaces
*/
/*
* Lock Modes
*/
/* Lock levels and flags are here */
#include <linux/dlmconstants.h>
#define DLM_LOCK_IV -1 /* invalid */
#define DLM_LOCK_NL 0 /* null */
#define DLM_LOCK_CR 1 /* concurrent read */
#define DLM_LOCK_CW 2 /* concurrent write */
#define DLM_LOCK_PR 3 /* protected read */
#define DLM_LOCK_PW 4 /* protected write */
#define DLM_LOCK_EX 5 /* exclusive */
/*
* Maximum size in bytes of a dlm_lock name
*/
#define DLM_RESNAME_MAXLEN 64
/*
* Flags to dlm_lock
*
* DLM_LKF_NOQUEUE
*
* Do not queue the lock request on the wait queue if it cannot be granted
* immediately. If the lock cannot be granted because of this flag, DLM will
* either return -EAGAIN from the dlm_lock call or will return 0 from
* dlm_lock and -EAGAIN in the lock status block when the AST is executed.
*
* DLM_LKF_CANCEL
*
* Used to cancel a pending lock request or conversion. A converting lock is
* returned to its previously granted mode.
*
* DLM_LKF_CONVERT
*
* Indicates a lock conversion request. For conversions the name and namelen
* are ignored and the lock ID in the LKSB is used to identify the lock.
*
* DLM_LKF_VALBLK
*
* Requests DLM to return the current contents of the lock value block in the
* lock status block. When this flag is set in a lock conversion from PW or EX
* modes, DLM assigns the value specified in the lock status block to the lock
* value block of the lock resource. The LVB is a DLM_LVB_LEN size array
* containing application-specific information.
*
* DLM_LKF_QUECVT
*
* Force a conversion request to be queued, even if it is compatible with
* the granted modes of other locks on the same resource.
*
* DLM_LKF_IVVALBLK
*
* Invalidate the lock value block.
*
* DLM_LKF_CONVDEADLK
*
* Allows the dlm to resolve conversion deadlocks internally by demoting the
* granted mode of a converting lock to NL. The DLM_SBF_DEMOTED flag is
* returned for a conversion that's been effected by this.
*
* DLM_LKF_PERSISTENT
*
* Only relevant to locks originating in userspace. A persistent lock will not
* be removed if the process holding the lock exits.
*
* DLM_LKF_NODLCKWT
*
* Do not cancel the lock if it gets into conversion deadlock.
* Exclude this lock from being monitored due to DLM_LSFL_TIMEWARN.
*
* DLM_LKF_NODLCKBLK
*
* net yet implemented
*
* DLM_LKF_EXPEDITE
*
* Used only with new requests for NL mode locks. Tells the lock manager
* to grant the lock, ignoring other locks in convert and wait queues.
*
* DLM_LKF_NOQUEUEBAST
*
* Send blocking AST's before returning -EAGAIN to the caller. It is only
* used along with the NOQUEUE flag. Blocking AST's are not sent for failed
* NOQUEUE requests otherwise.
*
* DLM_LKF_HEADQUE
*
* Add a lock to the head of the convert or wait queue rather than the tail.
*
* DLM_LKF_NOORDER
*
* Disregard the standard grant order rules and grant a lock as soon as it
* is compatible with other granted locks.
*
* DLM_LKF_ORPHAN
*
* not yet implemented
*
* DLM_LKF_ALTPR
*
* If the requested mode cannot be granted immediately, try to grant the lock
* in PR mode instead. If this alternate mode is granted instead of the
* requested mode, DLM_SBF_ALTMODE is returned in the lksb.
*
* DLM_LKF_ALTCW
*
* The same as ALTPR, but the alternate mode is CW.
*
* DLM_LKF_FORCEUNLOCK
*
* Unlock the lock even if it is converting or waiting or has sublocks.
* Only really for use by the userland device.c code.
*
*/
#define DLM_LKF_NOQUEUE 0x00000001
#define DLM_LKF_CANCEL 0x00000002
#define DLM_LKF_CONVERT 0x00000004
#define DLM_LKF_VALBLK 0x00000008
#define DLM_LKF_QUECVT 0x00000010
#define DLM_LKF_IVVALBLK 0x00000020
#define DLM_LKF_CONVDEADLK 0x00000040
#define DLM_LKF_PERSISTENT 0x00000080
#define DLM_LKF_NODLCKWT 0x00000100
#define DLM_LKF_NODLCKBLK 0x00000200
#define DLM_LKF_EXPEDITE 0x00000400
#define DLM_LKF_NOQUEUEBAST 0x00000800
#define DLM_LKF_HEADQUE 0x00001000
#define DLM_LKF_NOORDER 0x00002000
#define DLM_LKF_ORPHAN 0x00004000
#define DLM_LKF_ALTPR 0x00008000
#define DLM_LKF_ALTCW 0x00010000
#define DLM_LKF_FORCEUNLOCK 0x00020000
#define DLM_LKF_TIMEOUT 0x00040000
/*
* Some return codes that are not in errno.h
*/
#define DLM_ECANCEL 0x10001
#define DLM_EUNLOCK 0x10002
typedef void dlm_lockspace_t;

View file

@ -0,0 +1,159 @@
/******************************************************************************
*******************************************************************************
**
** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved.
**
** This copyrighted material is made available to anyone wishing to use,
** modify, copy, or redistribute it subject to the terms and conditions
** of the GNU General Public License v.2.
**
*******************************************************************************
******************************************************************************/
#ifndef __DLMCONSTANTS_DOT_H__
#define __DLMCONSTANTS_DOT_H__
/*
* Constants used by DLM interface.
*/
/*
* Lock Modes
*/
#define DLM_LOCK_IV (-1) /* invalid */
#define DLM_LOCK_NL 0 /* null */
#define DLM_LOCK_CR 1 /* concurrent read */
#define DLM_LOCK_CW 2 /* concurrent write */
#define DLM_LOCK_PR 3 /* protected read */
#define DLM_LOCK_PW 4 /* protected write */
#define DLM_LOCK_EX 5 /* exclusive */
/*
* Flags to dlm_lock
*
* DLM_LKF_NOQUEUE
*
* Do not queue the lock request on the wait queue if it cannot be granted
* immediately. If the lock cannot be granted because of this flag, DLM will
* either return -EAGAIN from the dlm_lock call or will return 0 from
* dlm_lock and -EAGAIN in the lock status block when the AST is executed.
*
* DLM_LKF_CANCEL
*
* Used to cancel a pending lock request or conversion. A converting lock is
* returned to its previously granted mode.
*
* DLM_LKF_CONVERT
*
* Indicates a lock conversion request. For conversions the name and namelen
* are ignored and the lock ID in the LKSB is used to identify the lock.
*
* DLM_LKF_VALBLK
*
* Requests DLM to return the current contents of the lock value block in the
* lock status block. When this flag is set in a lock conversion from PW or EX
* modes, DLM assigns the value specified in the lock status block to the lock
* value block of the lock resource. The LVB is a DLM_LVB_LEN size array
* containing application-specific information.
*
* DLM_LKF_QUECVT
*
* Force a conversion request to be queued, even if it is compatible with
* the granted modes of other locks on the same resource.
*
* DLM_LKF_IVVALBLK
*
* Invalidate the lock value block.
*
* DLM_LKF_CONVDEADLK
*
* Allows the dlm to resolve conversion deadlocks internally by demoting the
* granted mode of a converting lock to NL. The DLM_SBF_DEMOTED flag is
* returned for a conversion that's been effected by this.
*
* DLM_LKF_PERSISTENT
*
* Only relevant to locks originating in userspace. A persistent lock will not
* be removed if the process holding the lock exits.
*
* DLM_LKF_NODLCKWT
*
* Do not cancel the lock if it gets into conversion deadlock.
* Exclude this lock from being monitored due to DLM_LSFL_TIMEWARN.
*
* DLM_LKF_NODLCKBLK
*
* net yet implemented
*
* DLM_LKF_EXPEDITE
*
* Used only with new requests for NL mode locks. Tells the lock manager
* to grant the lock, ignoring other locks in convert and wait queues.
*
* DLM_LKF_NOQUEUEBAST
*
* Send blocking AST's before returning -EAGAIN to the caller. It is only
* used along with the NOQUEUE flag. Blocking AST's are not sent for failed
* NOQUEUE requests otherwise.
*
* DLM_LKF_HEADQUE
*
* Add a lock to the head of the convert or wait queue rather than the tail.
*
* DLM_LKF_NOORDER
*
* Disregard the standard grant order rules and grant a lock as soon as it
* is compatible with other granted locks.
*
* DLM_LKF_ORPHAN
*
* not yet implemented
*
* DLM_LKF_ALTPR
*
* If the requested mode cannot be granted immediately, try to grant the lock
* in PR mode instead. If this alternate mode is granted instead of the
* requested mode, DLM_SBF_ALTMODE is returned in the lksb.
*
* DLM_LKF_ALTCW
*
* The same as ALTPR, but the alternate mode is CW.
*
* DLM_LKF_FORCEUNLOCK
*
* Unlock the lock even if it is converting or waiting or has sublocks.
* Only really for use by the userland device.c code.
*
*/
#define DLM_LKF_NOQUEUE 0x00000001
#define DLM_LKF_CANCEL 0x00000002
#define DLM_LKF_CONVERT 0x00000004
#define DLM_LKF_VALBLK 0x00000008
#define DLM_LKF_QUECVT 0x00000010
#define DLM_LKF_IVVALBLK 0x00000020
#define DLM_LKF_CONVDEADLK 0x00000040
#define DLM_LKF_PERSISTENT 0x00000080
#define DLM_LKF_NODLCKWT 0x00000100
#define DLM_LKF_NODLCKBLK 0x00000200
#define DLM_LKF_EXPEDITE 0x00000400
#define DLM_LKF_NOQUEUEBAST 0x00000800
#define DLM_LKF_HEADQUE 0x00001000
#define DLM_LKF_NOORDER 0x00002000
#define DLM_LKF_ORPHAN 0x00004000
#define DLM_LKF_ALTPR 0x00008000
#define DLM_LKF_ALTCW 0x00010000
#define DLM_LKF_FORCEUNLOCK 0x00020000
#define DLM_LKF_TIMEOUT 0x00040000
/*
* Some return codes that are not in errno.h
*/
#define DLM_ECANCEL 0x10001
#define DLM_EUNLOCK 0x10002
#endif /* __DLMCONSTANTS_DOT_H__ */