Merge branch 'linux-next' of git://git.kernel.org/pub/scm/linux/kernel/git/jlbec/ocfs2

* 'linux-next' of git://git.kernel.org/pub/scm/linux/kernel/git/jlbec/ocfs2: (28 commits)
  Ocfs2: Teach local-mounted ocfs2 to handle unwritten_extents correctly.
  ocfs2/dlm: Do not migrate resource to a node that is leaving the domain
  ocfs2/dlm: Add new dlm message DLM_BEGIN_EXIT_DOMAIN_MSG
  Ocfs2/move_extents: Set several trivial constraints for threshold.
  Ocfs2/move_extents: Let defrag handle partial extent moving.
  Ocfs2/move_extents: move/defrag extents within a certain range.
  Ocfs2/move_extents: helper to calculate the defraging length in one run.
  Ocfs2/move_extents: move entire/partial extent.
  Ocfs2/move_extents: helpers to update the group descriptor and global bitmap inode.
  Ocfs2/move_extents: helper to probe a proper region to move in an alloc group.
  Ocfs2/move_extents: helper to validate and adjust moving goal.
  Ocfs2/move_extents: find the victim alloc group, where the given #blk fits.
  Ocfs2/move_extents: defrag a range of extent.
  Ocfs2/move_extents: move a range of extent.
  Ocfs2/move_extents: lock allocators and reserve metadata blocks and data clusters for extents moving.
  Ocfs2/move_extents: Add basic framework and source files for extent moving.
  Ocfs2/move_extents: Adding new ioctl code 'OCFS2_IOC_MOVE_EXT' to ocfs2.
  Ocfs2/refcounttree: Publicize couple of funcs from refcounttree.c
  Ocfs2: Add a new code 'OCFS2_INFO_FREEFRAG' for o2info ioctl.
  Ocfs2: Add a new code 'OCFS2_INFO_FREEINODE' for o2info ioctl.
  ...
This commit is contained in:
Linus Torvalds 2011-05-26 10:55:15 -07:00
commit a74b81b0af
22 changed files with 2147 additions and 263 deletions

View file

@ -1,11 +1,10 @@
What: /sys/o2cb symlink
Date: Dec 2005
KernelVersion: 2.6.16
Date: May 2011
KernelVersion: 2.6.40
Contact: ocfs2-devel@oss.oracle.com
Description: This is a symlink: /sys/o2cb to /sys/fs/o2cb. The symlink will
be removed when new versions of ocfs2-tools which know to look
Description: This is a symlink: /sys/o2cb to /sys/fs/o2cb. The symlink is
removed when new versions of ocfs2-tools which know to look
in /sys/fs/o2cb are sufficiently prevalent. Don't code new
software to look here, it should try /sys/fs/o2cb instead.
See Documentation/ABI/stable/o2cb for more information on usage.
Users: ocfs2-tools. It's sufficient to mail proposed changes to
ocfs2-devel@oss.oracle.com.

View file

@ -262,16 +262,6 @@ Who: Michael Buesch <mb@bu3sch.de>
---------------------------
What: /sys/o2cb symlink
When: January 2010
Why: /sys/fs/o2cb is the proper location for this information - /sys/o2cb
exists as a symlink for backwards compatibility for old versions of
ocfs2-tools. 2 years should be sufficient time to phase in new versions
which know to look in /sys/fs/o2cb.
Who: ocfs2-devel@oss.oracle.com
---------------------------
What: Ability for non root users to shm_get hugetlb pages based on mlock
resource limits
When: 2.6.31

View file

@ -46,9 +46,15 @@ errors=panic Panic and halt the machine if an error occurs.
intr (*) Allow signals to interrupt cluster operations.
nointr Do not allow signals to interrupt cluster
operations.
noatime Do not update access time.
relatime(*) Update atime if the previous atime is older than
mtime or ctime
strictatime Always update atime, but the minimum update interval
is specified by atime_quantum.
atime_quantum=60(*) OCFS2 will not update atime unless this number
of seconds has passed since the last update.
Set to zero to always update atime.
Set to zero to always update atime. This option need
work with strictatime.
data=ordered (*) All data are forced directly out to the main file
system prior to its metadata being committed to the
journal.

View file

@ -30,6 +30,7 @@ ocfs2-objs := \
namei.o \
refcounttree.o \
reservations.o \
move_extents.o \
resize.o \
slot_map.o \
suballoc.o \

View file

@ -29,6 +29,7 @@
#include <linux/highmem.h>
#include <linux/swap.h>
#include <linux/quotaops.h>
#include <linux/blkdev.h>
#include <cluster/masklog.h>
@ -7184,3 +7185,168 @@ out_commit:
out:
return ret;
}
static int ocfs2_trim_extent(struct super_block *sb,
struct ocfs2_group_desc *gd,
u32 start, u32 count)
{
u64 discard, bcount;
bcount = ocfs2_clusters_to_blocks(sb, count);
discard = le64_to_cpu(gd->bg_blkno) +
ocfs2_clusters_to_blocks(sb, start);
trace_ocfs2_trim_extent(sb, (unsigned long long)discard, bcount);
return sb_issue_discard(sb, discard, bcount, GFP_NOFS, 0);
}
static int ocfs2_trim_group(struct super_block *sb,
struct ocfs2_group_desc *gd,
u32 start, u32 max, u32 minbits)
{
int ret = 0, count = 0, next;
void *bitmap = gd->bg_bitmap;
if (le16_to_cpu(gd->bg_free_bits_count) < minbits)
return 0;
trace_ocfs2_trim_group((unsigned long long)le64_to_cpu(gd->bg_blkno),
start, max, minbits);
while (start < max) {
start = ocfs2_find_next_zero_bit(bitmap, max, start);
if (start >= max)
break;
next = ocfs2_find_next_bit(bitmap, max, start);
if ((next - start) >= minbits) {
ret = ocfs2_trim_extent(sb, gd,
start, next - start);
if (ret < 0) {
mlog_errno(ret);
break;
}
count += next - start;
}
start = next + 1;
if (fatal_signal_pending(current)) {
count = -ERESTARTSYS;
break;
}
if ((le16_to_cpu(gd->bg_free_bits_count) - count) < minbits)
break;
}
if (ret < 0)
count = ret;
return count;
}
int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
{
struct ocfs2_super *osb = OCFS2_SB(sb);
u64 start, len, trimmed, first_group, last_group, group;
int ret, cnt;
u32 first_bit, last_bit, minlen;
struct buffer_head *main_bm_bh = NULL;
struct inode *main_bm_inode = NULL;
struct buffer_head *gd_bh = NULL;
struct ocfs2_dinode *main_bm;
struct ocfs2_group_desc *gd = NULL;
start = range->start >> osb->s_clustersize_bits;
len = range->len >> osb->s_clustersize_bits;
minlen = range->minlen >> osb->s_clustersize_bits;
trimmed = 0;
if (!len) {
range->len = 0;
return 0;
}
if (minlen >= osb->bitmap_cpg)
return -EINVAL;
main_bm_inode = ocfs2_get_system_file_inode(osb,
GLOBAL_BITMAP_SYSTEM_INODE,
OCFS2_INVALID_SLOT);
if (!main_bm_inode) {
ret = -EIO;
mlog_errno(ret);
goto out;
}
mutex_lock(&main_bm_inode->i_mutex);
ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 0);
if (ret < 0) {
mlog_errno(ret);
goto out_mutex;
}
main_bm = (struct ocfs2_dinode *)main_bm_bh->b_data;
if (start >= le32_to_cpu(main_bm->i_clusters)) {
ret = -EINVAL;
goto out_unlock;
}
if (start + len > le32_to_cpu(main_bm->i_clusters))
len = le32_to_cpu(main_bm->i_clusters) - start;
trace_ocfs2_trim_fs(start, len, minlen);
/* Determine first and last group to examine based on start and len */
first_group = ocfs2_which_cluster_group(main_bm_inode, start);
if (first_group == osb->first_cluster_group_blkno)
first_bit = start;
else
first_bit = start - ocfs2_blocks_to_clusters(sb, first_group);
last_group = ocfs2_which_cluster_group(main_bm_inode, start + len - 1);
last_bit = osb->bitmap_cpg;
for (group = first_group; group <= last_group;) {
if (first_bit + len >= osb->bitmap_cpg)
last_bit = osb->bitmap_cpg;
else
last_bit = first_bit + len;
ret = ocfs2_read_group_descriptor(main_bm_inode,
main_bm, group,
&gd_bh);
if (ret < 0) {
mlog_errno(ret);
break;
}
gd = (struct ocfs2_group_desc *)gd_bh->b_data;
cnt = ocfs2_trim_group(sb, gd, first_bit, last_bit, minlen);
brelse(gd_bh);
gd_bh = NULL;
if (cnt < 0) {
ret = cnt;
mlog_errno(ret);
break;
}
trimmed += cnt;
len -= osb->bitmap_cpg - first_bit;
first_bit = 0;
if (group == osb->first_cluster_group_blkno)
group = ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
else
group += ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
}
range->len = trimmed * sb->s_blocksize;
out_unlock:
ocfs2_inode_unlock(main_bm_inode, 0);
brelse(main_bm_bh);
out_mutex:
mutex_unlock(&main_bm_inode->i_mutex);
iput(main_bm_inode);
out:
return ret;
}

View file

@ -239,6 +239,7 @@ int ocfs2_find_leaf(struct ocfs2_caching_info *ci,
struct buffer_head **leaf_bh);
int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster);
int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range);
/*
* Helper function to look at the # of clusters in an extent record.
*/

View file

@ -57,7 +57,6 @@ static struct kset *o2cb_kset;
void o2cb_sys_shutdown(void)
{
mlog_sys_shutdown();
sysfs_remove_link(NULL, "o2cb");
kset_unregister(o2cb_kset);
}
@ -69,14 +68,6 @@ int o2cb_sys_init(void)
if (!o2cb_kset)
return -ENOMEM;
/*
* Create this symlink for backwards compatibility with old
* versions of ocfs2-tools which look for things in /sys/o2cb.
*/
ret = sysfs_create_link(NULL, &o2cb_kset->kobj, "o2cb");
if (ret)
goto error;
ret = sysfs_create_group(&o2cb_kset->kobj, &o2cb_attr_group);
if (ret)
goto error;

View file

@ -144,6 +144,7 @@ struct dlm_ctxt
wait_queue_head_t dlm_join_events;
unsigned long live_nodes_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
unsigned long domain_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
unsigned long exit_domain_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
unsigned long recovery_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
struct dlm_recovery_ctxt reco;
spinlock_t master_lock;
@ -401,6 +402,18 @@ static inline int dlm_lvb_is_empty(char *lvb)
return 1;
}
static inline char *dlm_list_in_text(enum dlm_lockres_list idx)
{
if (idx == DLM_GRANTED_LIST)
return "granted";
else if (idx == DLM_CONVERTING_LIST)
return "converting";
else if (idx == DLM_BLOCKED_LIST)
return "blocked";
else
return "unknown";
}
static inline struct list_head *
dlm_list_idx_to_ptr(struct dlm_lock_resource *res, enum dlm_lockres_list idx)
{
@ -448,6 +461,7 @@ enum {
DLM_FINALIZE_RECO_MSG = 518,
DLM_QUERY_REGION = 519,
DLM_QUERY_NODEINFO = 520,
DLM_BEGIN_EXIT_DOMAIN_MSG = 521,
};
struct dlm_reco_node_data

View file

@ -756,6 +756,12 @@ static int debug_state_print(struct dlm_ctxt *dlm, char *buf, int len)
buf + out, len - out);
out += snprintf(buf + out, len - out, "\n");
/* Exit Domain Map: xx xx xx */
out += snprintf(buf + out, len - out, "Exit Domain Map: ");
out += stringify_nodemap(dlm->exit_domain_map, O2NM_MAX_NODES,
buf + out, len - out);
out += snprintf(buf + out, len - out, "\n");
/* Live Map: xx xx xx */
out += snprintf(buf + out, len - out, "Live Map: ");
out += stringify_nodemap(dlm->live_nodes_map, O2NM_MAX_NODES,

View file

@ -132,10 +132,12 @@ static DECLARE_WAIT_QUEUE_HEAD(dlm_domain_events);
* New in version 1.1:
* - Message DLM_QUERY_REGION added to support global heartbeat
* - Message DLM_QUERY_NODEINFO added to allow online node removes
* New in version 1.2:
* - Message DLM_BEGIN_EXIT_DOMAIN_MSG added to mark start of exit domain
*/
static const struct dlm_protocol_version dlm_protocol = {
.pv_major = 1,
.pv_minor = 1,
.pv_minor = 2,
};
#define DLM_DOMAIN_BACKOFF_MS 200
@ -449,14 +451,18 @@ redo_bucket:
dropped = dlm_empty_lockres(dlm, res);
spin_lock(&res->spinlock);
__dlm_lockres_calc_usage(dlm, res);
iter = res->hash_node.next;
if (dropped)
__dlm_lockres_calc_usage(dlm, res);
else
iter = res->hash_node.next;
spin_unlock(&res->spinlock);
dlm_lockres_put(res);
if (dropped)
if (dropped) {
cond_resched_lock(&dlm->spinlock);
goto redo_bucket;
}
}
cond_resched_lock(&dlm->spinlock);
num += n;
@ -486,6 +492,28 @@ static int dlm_no_joining_node(struct dlm_ctxt *dlm)
return ret;
}
static int dlm_begin_exit_domain_handler(struct o2net_msg *msg, u32 len,
void *data, void **ret_data)
{
struct dlm_ctxt *dlm = data;
unsigned int node;
struct dlm_exit_domain *exit_msg = (struct dlm_exit_domain *) msg->buf;
if (!dlm_grab(dlm))
return 0;
node = exit_msg->node_idx;
mlog(0, "%s: Node %u sent a begin exit domain message\n", dlm->name, node);
spin_lock(&dlm->spinlock);
set_bit(node, dlm->exit_domain_map);
spin_unlock(&dlm->spinlock);
dlm_put(dlm);
return 0;
}
static void dlm_mark_domain_leaving(struct dlm_ctxt *dlm)
{
/* Yikes, a double spinlock! I need domain_lock for the dlm
@ -542,6 +570,7 @@ static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data,
spin_lock(&dlm->spinlock);
clear_bit(node, dlm->domain_map);
clear_bit(node, dlm->exit_domain_map);
__dlm_print_nodes(dlm);
/* notify anything attached to the heartbeat events */
@ -554,29 +583,56 @@ static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data,
return 0;
}
static int dlm_send_one_domain_exit(struct dlm_ctxt *dlm,
static int dlm_send_one_domain_exit(struct dlm_ctxt *dlm, u32 msg_type,
unsigned int node)
{
int status;
struct dlm_exit_domain leave_msg;
mlog(0, "Asking node %u if we can leave the domain %s me = %u\n",
node, dlm->name, dlm->node_num);
mlog(0, "%s: Sending domain exit message %u to node %u\n", dlm->name,
msg_type, node);
memset(&leave_msg, 0, sizeof(leave_msg));
leave_msg.node_idx = dlm->node_num;
status = o2net_send_message(DLM_EXIT_DOMAIN_MSG, dlm->key,
&leave_msg, sizeof(leave_msg), node,
NULL);
status = o2net_send_message(msg_type, dlm->key, &leave_msg,
sizeof(leave_msg), node, NULL);
if (status < 0)
mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
"node %u\n", status, DLM_EXIT_DOMAIN_MSG, dlm->key, node);
mlog(0, "status return %d from o2net_send_message\n", status);
mlog(ML_ERROR, "Error %d sending domain exit message %u "
"to node %u on domain %s\n", status, msg_type, node,
dlm->name);
return status;
}
static void dlm_begin_exit_domain(struct dlm_ctxt *dlm)
{
int node = -1;
/* Support for begin exit domain was added in 1.2 */
if (dlm->dlm_locking_proto.pv_major == 1 &&
dlm->dlm_locking_proto.pv_minor < 2)
return;
/*
* Unlike DLM_EXIT_DOMAIN_MSG, DLM_BEGIN_EXIT_DOMAIN_MSG is purely
* informational. Meaning if a node does not receive the message,
* so be it.
*/
spin_lock(&dlm->spinlock);
while (1) {
node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES, node + 1);
if (node >= O2NM_MAX_NODES)
break;
if (node == dlm->node_num)
continue;
spin_unlock(&dlm->spinlock);
dlm_send_one_domain_exit(dlm, DLM_BEGIN_EXIT_DOMAIN_MSG, node);
spin_lock(&dlm->spinlock);
}
spin_unlock(&dlm->spinlock);
}
static void dlm_leave_domain(struct dlm_ctxt *dlm)
{
@ -602,7 +658,8 @@ static void dlm_leave_domain(struct dlm_ctxt *dlm)
clear_node = 1;
status = dlm_send_one_domain_exit(dlm, node);
status = dlm_send_one_domain_exit(dlm, DLM_EXIT_DOMAIN_MSG,
node);
if (status < 0 &&
status != -ENOPROTOOPT &&
status != -ENOTCONN) {
@ -677,6 +734,7 @@ void dlm_unregister_domain(struct dlm_ctxt *dlm)
if (leave) {
mlog(0, "shutting down domain %s\n", dlm->name);
dlm_begin_exit_domain(dlm);
/* We changed dlm state, notify the thread */
dlm_kick_thread(dlm, NULL);
@ -909,6 +967,7 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data,
* leftover join state. */
BUG_ON(dlm->joining_node != assert->node_idx);
set_bit(assert->node_idx, dlm->domain_map);
clear_bit(assert->node_idx, dlm->exit_domain_map);
__dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
printk(KERN_NOTICE "o2dlm: Node %u joins domain %s\n",
@ -1793,6 +1852,13 @@ static int dlm_register_domain_handlers(struct dlm_ctxt *dlm)
if (status)
goto bail;
status = o2net_register_handler(DLM_BEGIN_EXIT_DOMAIN_MSG, dlm->key,
sizeof(struct dlm_exit_domain),
dlm_begin_exit_domain_handler,
dlm, NULL, &dlm->dlm_domain_handlers);
if (status)
goto bail;
bail:
if (status)
dlm_unregister_domain_handlers(dlm);

View file

@ -2339,65 +2339,55 @@ static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data)
dlm_lockres_put(res);
}
/* Checks whether the lockres can be migrated. Returns 0 if yes, < 0
* if not. If 0, numlocks is set to the number of locks in the lockres.
/*
* A migrateable resource is one that is :
* 1. locally mastered, and,
* 2. zero local locks, and,
* 3. one or more non-local locks, or, one or more references
* Returns 1 if yes, 0 if not.
*/
static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res,
int *numlocks,
int *hasrefs)
struct dlm_lock_resource *res)
{
int ret;
int i;
int count = 0;
enum dlm_lockres_list idx;
int nonlocal = 0, node_ref;
struct list_head *queue;
struct dlm_lock *lock;
u64 cookie;
assert_spin_locked(&res->spinlock);
*numlocks = 0;
*hasrefs = 0;
if (res->owner != dlm->node_num)
return 0;
ret = -EINVAL;
if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
mlog(0, "cannot migrate lockres with unknown owner!\n");
goto leave;
}
if (res->owner != dlm->node_num) {
mlog(0, "cannot migrate lockres this node doesn't own!\n");
goto leave;
}
ret = 0;
queue = &res->granted;
for (i = 0; i < 3; i++) {
for (idx = DLM_GRANTED_LIST; idx <= DLM_BLOCKED_LIST; idx++) {
queue = dlm_list_idx_to_ptr(res, idx);
list_for_each_entry(lock, queue, list) {
++count;
if (lock->ml.node == dlm->node_num) {
mlog(0, "found a lock owned by this node still "
"on the %s queue! will not migrate this "
"lockres\n", (i == 0 ? "granted" :
(i == 1 ? "converting" :
"blocked")));
ret = -ENOTEMPTY;
goto leave;
if (lock->ml.node != dlm->node_num) {
nonlocal++;
continue;
}
cookie = be64_to_cpu(lock->ml.cookie);
mlog(0, "%s: Not migrateable res %.*s, lock %u:%llu on "
"%s list\n", dlm->name, res->lockname.len,
res->lockname.name,
dlm_get_lock_cookie_node(cookie),
dlm_get_lock_cookie_seq(cookie),
dlm_list_in_text(idx));
return 0;
}
queue++;
}
*numlocks = count;
if (!nonlocal) {
node_ref = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
if (node_ref >= O2NM_MAX_NODES)
return 0;
}
count = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
if (count < O2NM_MAX_NODES)
*hasrefs = 1;
mlog(0, "%s: res %.*s, Migrateable\n", dlm->name, res->lockname.len,
res->lockname.name);
mlog(0, "%s: res %.*s, Migrateable, locks %d, refs %d\n", dlm->name,
res->lockname.len, res->lockname.name, *numlocks, *hasrefs);
leave:
return ret;
return 1;
}
/*
@ -2406,8 +2396,7 @@ leave:
static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res,
u8 target)
struct dlm_lock_resource *res, u8 target)
{
struct dlm_master_list_entry *mle = NULL;
struct dlm_master_list_entry *oldmle = NULL;
@ -2416,37 +2405,20 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
const char *name;
unsigned int namelen;
int mle_added = 0;
int numlocks, hasrefs;
int wake = 0;
if (!dlm_grab(dlm))
return -EINVAL;
BUG_ON(target == O2NM_MAX_NODES);
name = res->lockname.name;
namelen = res->lockname.len;
mlog(0, "%s: Migrating %.*s to %u\n", dlm->name, namelen, name, target);
/*
* ensure this lockres is a proper candidate for migration
*/
spin_lock(&res->spinlock);
ret = dlm_is_lockres_migrateable(dlm, res, &numlocks, &hasrefs);
if (ret < 0) {
spin_unlock(&res->spinlock);
goto leave;
}
spin_unlock(&res->spinlock);
/* no work to do */
if (numlocks == 0 && !hasrefs)
goto leave;
/*
* preallocate up front
* if this fails, abort
*/
mlog(0, "%s: Migrating %.*s to node %u\n", dlm->name, namelen, name,
target);
/* preallocate up front. if this fails, abort */
ret = -ENOMEM;
mres = (struct dlm_migratable_lockres *) __get_free_page(GFP_NOFS);
if (!mres) {
@ -2461,36 +2433,11 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
}
ret = 0;
/*
* find a node to migrate the lockres to
*/
spin_lock(&dlm->spinlock);
/* pick a new node */
if (!test_bit(target, dlm->domain_map) ||
target >= O2NM_MAX_NODES) {
target = dlm_pick_migration_target(dlm, res);
}
mlog(0, "%s: res %.*s, Node %u chosen for migration\n", dlm->name,
namelen, name, target);
if (target >= O2NM_MAX_NODES ||
!test_bit(target, dlm->domain_map)) {
/* target chosen is not alive */
ret = -EINVAL;
}
if (ret) {
spin_unlock(&dlm->spinlock);
goto fail;
}
mlog(0, "continuing with target = %u\n", target);
/*
* clear any existing master requests and
* add the migration mle to the list
*/
spin_lock(&dlm->spinlock);
spin_lock(&dlm->master_lock);
ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, name,
namelen, target, dlm->node_num);
@ -2531,6 +2478,7 @@ fail:
dlm_put_mle(mle);
} else if (mle) {
kmem_cache_free(dlm_mle_cache, mle);
mle = NULL;
}
goto leave;
}
@ -2652,69 +2600,52 @@ leave:
if (wake)
wake_up(&res->wq);
/* TODO: cleanup */
if (mres)
free_page((unsigned long)mres);
dlm_put(dlm);
mlog(0, "returning %d\n", ret);
mlog(0, "%s: Migrating %.*s to %u, returns %d\n", dlm->name, namelen,
name, target, ret);
return ret;
}
#define DLM_MIGRATION_RETRY_MS 100
/* Should be called only after beginning the domain leave process.
/*
* Should be called only after beginning the domain leave process.
* There should not be any remaining locks on nonlocal lock resources,
* and there should be no local locks left on locally mastered resources.
*
* Called with the dlm spinlock held, may drop it to do migration, but
* will re-acquire before exit.
*
* Returns: 1 if dlm->spinlock was dropped/retaken, 0 if never dropped */
* Returns: 1 if dlm->spinlock was dropped/retaken, 0 if never dropped
*/
int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
{
int ret;
int lock_dropped = 0;
int numlocks, hasrefs;
u8 target = O2NM_MAX_NODES;
assert_spin_locked(&dlm->spinlock);
spin_lock(&res->spinlock);
if (res->owner != dlm->node_num) {
if (!__dlm_lockres_unused(res)) {
mlog(ML_ERROR, "%s:%.*s: this node is not master, "
"trying to free this but locks remain\n",
dlm->name, res->lockname.len, res->lockname.name);
}
spin_unlock(&res->spinlock);
goto leave;
}
/* No need to migrate a lockres having no locks */
ret = dlm_is_lockres_migrateable(dlm, res, &numlocks, &hasrefs);
if (ret >= 0 && numlocks == 0 && !hasrefs) {
spin_unlock(&res->spinlock);
goto leave;
}
if (dlm_is_lockres_migrateable(dlm, res))
target = dlm_pick_migration_target(dlm, res);
spin_unlock(&res->spinlock);
if (target == O2NM_MAX_NODES)
goto leave;
/* Wheee! Migrate lockres here! Will sleep so drop spinlock. */
spin_unlock(&dlm->spinlock);
lock_dropped = 1;
while (1) {
ret = dlm_migrate_lockres(dlm, res, O2NM_MAX_NODES);
if (ret >= 0)
break;
if (ret == -ENOTEMPTY) {
mlog(ML_ERROR, "lockres %.*s still has local locks!\n",
res->lockname.len, res->lockname.name);
BUG();
}
mlog(0, "lockres %.*s: migrate failed, "
"retrying\n", res->lockname.len,
res->lockname.name);
msleep(DLM_MIGRATION_RETRY_MS);
}
ret = dlm_migrate_lockres(dlm, res, target);
if (ret)
mlog(0, "%s: res %.*s, Migrate to node %u failed with %d\n",
dlm->name, res->lockname.len, res->lockname.name,
target, ret);
spin_lock(&dlm->spinlock);
leave:
return lock_dropped;
@ -2898,61 +2829,55 @@ static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm,
}
}
/* for now this is not too intelligent. we will
* need stats to make this do the right thing.
* this just finds the first lock on one of the
* queues and uses that node as the target. */
/*
* Pick a node to migrate the lock resource to. This function selects a
* potential target based first on the locks and then on refmap. It skips
* nodes that are in the process of exiting the domain.
*/
static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res)
{
int i;
enum dlm_lockres_list idx;
struct list_head *queue = &res->granted;
struct dlm_lock *lock;
int nodenum;
int noderef;
u8 nodenum = O2NM_MAX_NODES;
assert_spin_locked(&dlm->spinlock);
assert_spin_locked(&res->spinlock);
spin_lock(&res->spinlock);
for (i=0; i<3; i++) {
/* Go through all the locks */
for (idx = DLM_GRANTED_LIST; idx <= DLM_BLOCKED_LIST; idx++) {
queue = dlm_list_idx_to_ptr(res, idx);
list_for_each_entry(lock, queue, list) {
/* up to the caller to make sure this node
* is alive */
if (lock->ml.node != dlm->node_num) {
spin_unlock(&res->spinlock);
return lock->ml.node;
}
if (lock->ml.node == dlm->node_num)
continue;
if (test_bit(lock->ml.node, dlm->exit_domain_map))
continue;
nodenum = lock->ml.node;
goto bail;
}
queue++;
}
nodenum = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
if (nodenum < O2NM_MAX_NODES) {
spin_unlock(&res->spinlock);
return nodenum;
}
spin_unlock(&res->spinlock);
mlog(0, "have not found a suitable target yet! checking domain map\n");
/* ok now we're getting desperate. pick anyone alive. */
nodenum = -1;
/* Go thru the refmap */
noderef = -1;
while (1) {
nodenum = find_next_bit(dlm->domain_map,
O2NM_MAX_NODES, nodenum+1);
mlog(0, "found %d in domain map\n", nodenum);
if (nodenum >= O2NM_MAX_NODES)
noderef = find_next_bit(res->refmap, O2NM_MAX_NODES,
noderef + 1);
if (noderef >= O2NM_MAX_NODES)
break;
if (nodenum != dlm->node_num) {
mlog(0, "picking %d\n", nodenum);
return nodenum;
}
if (noderef == dlm->node_num)
continue;
if (test_bit(noderef, dlm->exit_domain_map))
continue;
nodenum = noderef;
goto bail;
}
mlog(0, "giving up. no master to migrate to\n");
return DLM_LOCK_RES_OWNER_UNKNOWN;
bail:
return nodenum;
}
/* this is called by the new master once all lockres
* data has been received */
static int dlm_do_migrate_request(struct dlm_ctxt *dlm,

View file

@ -2393,6 +2393,7 @@ static void __dlm_hb_node_down(struct dlm_ctxt *dlm, int idx)
mlog(0, "node %u being removed from domain map!\n", idx);
clear_bit(idx, dlm->domain_map);
clear_bit(idx, dlm->exit_domain_map);
/* wake up migration waiters if a node goes down.
* perhaps later we can genericize this for other waiters. */
wake_up(&dlm->migration_wq);

View file

@ -88,7 +88,7 @@ struct workqueue_struct *user_dlm_worker;
* signifies a bast fired on the lock.
*/
#define DLMFS_CAPABILITIES "bast stackglue"
extern int param_set_dlmfs_capabilities(const char *val,
static int param_set_dlmfs_capabilities(const char *val,
struct kernel_param *kp)
{
printk(KERN_ERR "%s: readonly parameter\n", kp->name);

View file

@ -2670,6 +2670,7 @@ const struct file_operations ocfs2_fops_no_plocks = {
.flock = ocfs2_flock,
.splice_read = ocfs2_file_splice_read,
.splice_write = ocfs2_file_splice_write,
.fallocate = ocfs2_fallocate,
};
const struct file_operations ocfs2_dops_no_plocks = {

View file

@ -22,6 +22,11 @@
#include "ioctl.h"
#include "resize.h"
#include "refcounttree.h"
#include "sysfile.h"
#include "dir.h"
#include "buffer_head_io.h"
#include "suballoc.h"
#include "move_extents.h"
#include <linux/ext2_fs.h>
@ -35,31 +40,27 @@
* be -EFAULT. The error will be returned from the ioctl(2) call. It's
* just a best-effort to tell userspace that this request caused the error.
*/
static inline void __o2info_set_request_error(struct ocfs2_info_request *kreq,
static inline void o2info_set_request_error(struct ocfs2_info_request *kreq,
struct ocfs2_info_request __user *req)
{
kreq->ir_flags |= OCFS2_INFO_FL_ERROR;
(void)put_user(kreq->ir_flags, (__u32 __user *)&(req->ir_flags));
}
#define o2info_set_request_error(a, b) \
__o2info_set_request_error((struct ocfs2_info_request *)&(a), b)
static inline void __o2info_set_request_filled(struct ocfs2_info_request *req)
static inline void o2info_set_request_filled(struct ocfs2_info_request *req)
{
req->ir_flags |= OCFS2_INFO_FL_FILLED;
}
#define o2info_set_request_filled(a) \
__o2info_set_request_filled((struct ocfs2_info_request *)&(a))
static inline void __o2info_clear_request_filled(struct ocfs2_info_request *req)
static inline void o2info_clear_request_filled(struct ocfs2_info_request *req)
{
req->ir_flags &= ~OCFS2_INFO_FL_FILLED;
}
#define o2info_clear_request_filled(a) \
__o2info_clear_request_filled((struct ocfs2_info_request *)&(a))
static inline int o2info_coherent(struct ocfs2_info_request *req)
{
return (!(req->ir_flags & OCFS2_INFO_FL_NON_COHERENT));
}
static int ocfs2_get_inode_attr(struct inode *inode, unsigned *flags)
{
@ -153,7 +154,7 @@ int ocfs2_info_handle_blocksize(struct inode *inode,
oib.ib_blocksize = inode->i_sb->s_blocksize;
o2info_set_request_filled(oib);
o2info_set_request_filled(&oib.ib_req);
if (o2info_to_user(oib, req))
goto bail;
@ -161,7 +162,7 @@ int ocfs2_info_handle_blocksize(struct inode *inode,
status = 0;
bail:
if (status)
o2info_set_request_error(oib, req);
o2info_set_request_error(&oib.ib_req, req);
return status;
}
@ -178,7 +179,7 @@ int ocfs2_info_handle_clustersize(struct inode *inode,
oic.ic_clustersize = osb->s_clustersize;
o2info_set_request_filled(oic);
o2info_set_request_filled(&oic.ic_req);
if (o2info_to_user(oic, req))
goto bail;
@ -186,7 +187,7 @@ int ocfs2_info_handle_clustersize(struct inode *inode,
status = 0;
bail:
if (status)
o2info_set_request_error(oic, req);
o2info_set_request_error(&oic.ic_req, req);
return status;
}
@ -203,7 +204,7 @@ int ocfs2_info_handle_maxslots(struct inode *inode,
oim.im_max_slots = osb->max_slots;
o2info_set_request_filled(oim);
o2info_set_request_filled(&oim.im_req);
if (o2info_to_user(oim, req))
goto bail;
@ -211,7 +212,7 @@ int ocfs2_info_handle_maxslots(struct inode *inode,
status = 0;
bail:
if (status)
o2info_set_request_error(oim, req);
o2info_set_request_error(&oim.im_req, req);
return status;
}
@ -228,7 +229,7 @@ int ocfs2_info_handle_label(struct inode *inode,
memcpy(oil.il_label, osb->vol_label, OCFS2_MAX_VOL_LABEL_LEN);
o2info_set_request_filled(oil);
o2info_set_request_filled(&oil.il_req);
if (o2info_to_user(oil, req))
goto bail;
@ -236,7 +237,7 @@ int ocfs2_info_handle_label(struct inode *inode,
status = 0;
bail:
if (status)
o2info_set_request_error(oil, req);
o2info_set_request_error(&oil.il_req, req);
return status;
}
@ -253,7 +254,7 @@ int ocfs2_info_handle_uuid(struct inode *inode,
memcpy(oiu.iu_uuid_str, osb->uuid_str, OCFS2_TEXT_UUID_LEN + 1);
o2info_set_request_filled(oiu);
o2info_set_request_filled(&oiu.iu_req);
if (o2info_to_user(oiu, req))
goto bail;
@ -261,7 +262,7 @@ int ocfs2_info_handle_uuid(struct inode *inode,
status = 0;
bail:
if (status)
o2info_set_request_error(oiu, req);
o2info_set_request_error(&oiu.iu_req, req);
return status;
}
@ -280,7 +281,7 @@ int ocfs2_info_handle_fs_features(struct inode *inode,
oif.if_incompat_features = osb->s_feature_incompat;
oif.if_ro_compat_features = osb->s_feature_ro_compat;
o2info_set_request_filled(oif);
o2info_set_request_filled(&oif.if_req);
if (o2info_to_user(oif, req))
goto bail;
@ -288,7 +289,7 @@ int ocfs2_info_handle_fs_features(struct inode *inode,
status = 0;
bail:
if (status)
o2info_set_request_error(oif, req);
o2info_set_request_error(&oif.if_req, req);
return status;
}
@ -305,7 +306,7 @@ int ocfs2_info_handle_journal_size(struct inode *inode,
oij.ij_journal_size = osb->journal->j_inode->i_size;
o2info_set_request_filled(oij);
o2info_set_request_filled(&oij.ij_req);
if (o2info_to_user(oij, req))
goto bail;
@ -313,7 +314,408 @@ int ocfs2_info_handle_journal_size(struct inode *inode,
status = 0;
bail:
if (status)
o2info_set_request_error(oij, req);
o2info_set_request_error(&oij.ij_req, req);
return status;
}
int ocfs2_info_scan_inode_alloc(struct ocfs2_super *osb,
struct inode *inode_alloc, u64 blkno,
struct ocfs2_info_freeinode *fi, u32 slot)
{
int status = 0, unlock = 0;
struct buffer_head *bh = NULL;
struct ocfs2_dinode *dinode_alloc = NULL;
if (inode_alloc)
mutex_lock(&inode_alloc->i_mutex);
if (o2info_coherent(&fi->ifi_req)) {
status = ocfs2_inode_lock(inode_alloc, &bh, 0);
if (status < 0) {
mlog_errno(status);
goto bail;
}
unlock = 1;
} else {
status = ocfs2_read_blocks_sync(osb, blkno, 1, &bh);
if (status < 0) {
mlog_errno(status);
goto bail;
}
}
dinode_alloc = (struct ocfs2_dinode *)bh->b_data;
fi->ifi_stat[slot].lfi_total =
le32_to_cpu(dinode_alloc->id1.bitmap1.i_total);
fi->ifi_stat[slot].lfi_free =
le32_to_cpu(dinode_alloc->id1.bitmap1.i_total) -
le32_to_cpu(dinode_alloc->id1.bitmap1.i_used);
bail:
if (unlock)
ocfs2_inode_unlock(inode_alloc, 0);
if (inode_alloc)
mutex_unlock(&inode_alloc->i_mutex);
brelse(bh);
return status;
}
int ocfs2_info_handle_freeinode(struct inode *inode,
struct ocfs2_info_request __user *req)
{
u32 i;
u64 blkno = -1;
char namebuf[40];
int status = -EFAULT, type = INODE_ALLOC_SYSTEM_INODE;
struct ocfs2_info_freeinode *oifi = NULL;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct inode *inode_alloc = NULL;
oifi = kzalloc(sizeof(struct ocfs2_info_freeinode), GFP_KERNEL);
if (!oifi) {
status = -ENOMEM;
mlog_errno(status);
goto bail;
}
if (o2info_from_user(*oifi, req))
goto bail;
oifi->ifi_slotnum = osb->max_slots;
for (i = 0; i < oifi->ifi_slotnum; i++) {
if (o2info_coherent(&oifi->ifi_req)) {
inode_alloc = ocfs2_get_system_file_inode(osb, type, i);
if (!inode_alloc) {
mlog(ML_ERROR, "unable to get alloc inode in "
"slot %u\n", i);
status = -EIO;
goto bail;
}
} else {
ocfs2_sprintf_system_inode_name(namebuf,
sizeof(namebuf),
type, i);
status = ocfs2_lookup_ino_from_name(osb->sys_root_inode,
namebuf,
strlen(namebuf),
&blkno);
if (status < 0) {
status = -ENOENT;
goto bail;
}
}
status = ocfs2_info_scan_inode_alloc(osb, inode_alloc, blkno, oifi, i);
if (status < 0)
goto bail;
iput(inode_alloc);
inode_alloc = NULL;
}
o2info_set_request_filled(&oifi->ifi_req);
if (o2info_to_user(*oifi, req))
goto bail;
status = 0;
bail:
if (status)
o2info_set_request_error(&oifi->ifi_req, req);
kfree(oifi);
return status;
}
static void o2ffg_update_histogram(struct ocfs2_info_free_chunk_list *hist,
unsigned int chunksize)
{
int index;
index = __ilog2_u32(chunksize);
if (index >= OCFS2_INFO_MAX_HIST)
index = OCFS2_INFO_MAX_HIST - 1;
hist->fc_chunks[index]++;
hist->fc_clusters[index] += chunksize;
}
static void o2ffg_update_stats(struct ocfs2_info_freefrag_stats *stats,
unsigned int chunksize)
{
if (chunksize > stats->ffs_max)
stats->ffs_max = chunksize;
if (chunksize < stats->ffs_min)
stats->ffs_min = chunksize;
stats->ffs_avg += chunksize;
stats->ffs_free_chunks_real++;
}
void ocfs2_info_update_ffg(struct ocfs2_info_freefrag *ffg,
unsigned int chunksize)
{
o2ffg_update_histogram(&(ffg->iff_ffs.ffs_fc_hist), chunksize);
o2ffg_update_stats(&(ffg->iff_ffs), chunksize);
}
int ocfs2_info_freefrag_scan_chain(struct ocfs2_super *osb,
struct inode *gb_inode,
struct ocfs2_dinode *gb_dinode,
struct ocfs2_chain_rec *rec,
struct ocfs2_info_freefrag *ffg,
u32 chunks_in_group)
{
int status = 0, used;
u64 blkno;
struct buffer_head *bh = NULL;
struct ocfs2_group_desc *bg = NULL;
unsigned int max_bits, num_clusters;
unsigned int offset = 0, cluster, chunk;
unsigned int chunk_free, last_chunksize = 0;
if (!le32_to_cpu(rec->c_free))
goto bail;
do {
if (!bg)
blkno = le64_to_cpu(rec->c_blkno);
else
blkno = le64_to_cpu(bg->bg_next_group);
if (bh) {
brelse(bh);
bh = NULL;
}
if (o2info_coherent(&ffg->iff_req))
status = ocfs2_read_group_descriptor(gb_inode,
gb_dinode,
blkno, &bh);
else
status = ocfs2_read_blocks_sync(osb, blkno, 1, &bh);
if (status < 0) {
mlog(ML_ERROR, "Can't read the group descriptor # "
"%llu from device.", (unsigned long long)blkno);
status = -EIO;
goto bail;
}
bg = (struct ocfs2_group_desc *)bh->b_data;
if (!le16_to_cpu(bg->bg_free_bits_count))
continue;
max_bits = le16_to_cpu(bg->bg_bits);
offset = 0;
for (chunk = 0; chunk < chunks_in_group; chunk++) {
/*
* last chunk may be not an entire one.
*/
if ((offset + ffg->iff_chunksize) > max_bits)
num_clusters = max_bits - offset;
else
num_clusters = ffg->iff_chunksize;
chunk_free = 0;
for (cluster = 0; cluster < num_clusters; cluster++) {
used = ocfs2_test_bit(offset,
(unsigned long *)bg->bg_bitmap);
/*
* - chunk_free counts free clusters in #N chunk.
* - last_chunksize records the size(in) clusters
* for the last real free chunk being counted.
*/
if (!used) {
last_chunksize++;
chunk_free++;
}
if (used && last_chunksize) {
ocfs2_info_update_ffg(ffg,
last_chunksize);
last_chunksize = 0;
}
offset++;
}
if (chunk_free == ffg->iff_chunksize)
ffg->iff_ffs.ffs_free_chunks++;
}
/*
* need to update the info for last free chunk.
*/
if (last_chunksize)
ocfs2_info_update_ffg(ffg, last_chunksize);
} while (le64_to_cpu(bg->bg_next_group));
bail:
brelse(bh);
return status;
}
int ocfs2_info_freefrag_scan_bitmap(struct ocfs2_super *osb,
struct inode *gb_inode, u64 blkno,
struct ocfs2_info_freefrag *ffg)
{
u32 chunks_in_group;
int status = 0, unlock = 0, i;
struct buffer_head *bh = NULL;
struct ocfs2_chain_list *cl = NULL;
struct ocfs2_chain_rec *rec = NULL;
struct ocfs2_dinode *gb_dinode = NULL;
if (gb_inode)
mutex_lock(&gb_inode->i_mutex);
if (o2info_coherent(&ffg->iff_req)) {
status = ocfs2_inode_lock(gb_inode, &bh, 0);
if (status < 0) {
mlog_errno(status);
goto bail;
}
unlock = 1;
} else {
status = ocfs2_read_blocks_sync(osb, blkno, 1, &bh);
if (status < 0) {
mlog_errno(status);
goto bail;
}
}
gb_dinode = (struct ocfs2_dinode *)bh->b_data;
cl = &(gb_dinode->id2.i_chain);
/*
* Chunksize(in) clusters from userspace should be
* less than clusters in a group.
*/
if (ffg->iff_chunksize > le16_to_cpu(cl->cl_cpg)) {
status = -EINVAL;
goto bail;
}
memset(&ffg->iff_ffs, 0, sizeof(struct ocfs2_info_freefrag_stats));
ffg->iff_ffs.ffs_min = ~0U;
ffg->iff_ffs.ffs_clusters =
le32_to_cpu(gb_dinode->id1.bitmap1.i_total);
ffg->iff_ffs.ffs_free_clusters = ffg->iff_ffs.ffs_clusters -
le32_to_cpu(gb_dinode->id1.bitmap1.i_used);
chunks_in_group = le16_to_cpu(cl->cl_cpg) / ffg->iff_chunksize + 1;
for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i++) {
rec = &(cl->cl_recs[i]);
status = ocfs2_info_freefrag_scan_chain(osb, gb_inode,
gb_dinode,
rec, ffg,
chunks_in_group);
if (status)
goto bail;
}
if (ffg->iff_ffs.ffs_free_chunks_real)
ffg->iff_ffs.ffs_avg = (ffg->iff_ffs.ffs_avg /
ffg->iff_ffs.ffs_free_chunks_real);
bail:
if (unlock)
ocfs2_inode_unlock(gb_inode, 0);
if (gb_inode)
mutex_unlock(&gb_inode->i_mutex);
if (gb_inode)
iput(gb_inode);
brelse(bh);
return status;
}
int ocfs2_info_handle_freefrag(struct inode *inode,
struct ocfs2_info_request __user *req)
{
u64 blkno = -1;
char namebuf[40];
int status = -EFAULT, type = GLOBAL_BITMAP_SYSTEM_INODE;
struct ocfs2_info_freefrag *oiff;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct inode *gb_inode = NULL;
oiff = kzalloc(sizeof(struct ocfs2_info_freefrag), GFP_KERNEL);
if (!oiff) {
status = -ENOMEM;
mlog_errno(status);
goto bail;
}
if (o2info_from_user(*oiff, req))
goto bail;
/*
* chunksize from userspace should be power of 2.
*/
if ((oiff->iff_chunksize & (oiff->iff_chunksize - 1)) ||
(!oiff->iff_chunksize)) {
status = -EINVAL;
goto bail;
}
if (o2info_coherent(&oiff->iff_req)) {
gb_inode = ocfs2_get_system_file_inode(osb, type,
OCFS2_INVALID_SLOT);
if (!gb_inode) {
mlog(ML_ERROR, "unable to get global_bitmap inode\n");
status = -EIO;
goto bail;
}
} else {
ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type,
OCFS2_INVALID_SLOT);
status = ocfs2_lookup_ino_from_name(osb->sys_root_inode,
namebuf,
strlen(namebuf),
&blkno);
if (status < 0) {
status = -ENOENT;
goto bail;
}
}
status = ocfs2_info_freefrag_scan_bitmap(osb, gb_inode, blkno, oiff);
if (status < 0)
goto bail;
o2info_set_request_filled(&oiff->iff_req);
if (o2info_to_user(*oiff, req))
goto bail;
status = 0;
bail:
if (status)
o2info_set_request_error(&oiff->iff_req, req);
kfree(oiff);
return status;
}
@ -327,7 +729,7 @@ int ocfs2_info_handle_unknown(struct inode *inode,
if (o2info_from_user(oir, req))
goto bail;
o2info_clear_request_filled(oir);
o2info_clear_request_filled(&oir);
if (o2info_to_user(oir, req))
goto bail;
@ -335,7 +737,7 @@ int ocfs2_info_handle_unknown(struct inode *inode,
status = 0;
bail:
if (status)
o2info_set_request_error(oir, req);
o2info_set_request_error(&oir, req);
return status;
}
@ -389,6 +791,14 @@ int ocfs2_info_handle_request(struct inode *inode,
if (oir.ir_size == sizeof(struct ocfs2_info_journal_size))
status = ocfs2_info_handle_journal_size(inode, req);
break;
case OCFS2_INFO_FREEINODE:
if (oir.ir_size == sizeof(struct ocfs2_info_freeinode))
status = ocfs2_info_handle_freeinode(inode, req);
break;
case OCFS2_INFO_FREEFRAG:
if (oir.ir_size == sizeof(struct ocfs2_info_freefrag))
status = ocfs2_info_handle_freefrag(inode, req);
break;
default:
status = ocfs2_info_handle_unknown(inode, req);
break;
@ -542,6 +952,31 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
return -EFAULT;
return ocfs2_info_handle(inode, &info, 0);
case FITRIM:
{
struct super_block *sb = inode->i_sb;
struct fstrim_range range;
int ret = 0;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
if (copy_from_user(&range, (struct fstrim_range *)arg,
sizeof(range)))
return -EFAULT;
ret = ocfs2_trim_fs(sb, &range);
if (ret < 0)
return ret;
if (copy_to_user((struct fstrim_range *)arg, &range,
sizeof(range)))
return -EFAULT;
return 0;
}
case OCFS2_IOC_MOVE_EXT:
return ocfs2_ioctl_move_extents(filp, (void __user *)arg);
default:
return -ENOTTY;
}
@ -569,6 +1004,7 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
case OCFS2_IOC_GROUP_EXTEND:
case OCFS2_IOC_GROUP_ADD:
case OCFS2_IOC_GROUP_ADD64:
case FITRIM:
break;
case OCFS2_IOC_REFLINK:
if (copy_from_user(&args, (struct reflink_arguments *)arg,
@ -584,6 +1020,8 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
return -EFAULT;
return ocfs2_info_handle(inode, &info, 1);
case OCFS2_IOC_MOVE_EXT:
break;
default:
return -ENOIOCTLCMD;
}

1153
fs/ocfs2/move_extents.c Normal file

File diff suppressed because it is too large Load diff

22
fs/ocfs2/move_extents.h Normal file
View file

@ -0,0 +1,22 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* move_extents.h
*
* Copyright (C) 2011 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License version 2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
#ifndef OCFS2_MOVE_EXTENTS_H
#define OCFS2_MOVE_EXTENTS_H
int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp);
#endif /* OCFS2_MOVE_EXTENTS_H */

View file

@ -142,6 +142,38 @@ struct ocfs2_info_journal_size {
__u64 ij_journal_size;
};
struct ocfs2_info_freeinode {
struct ocfs2_info_request ifi_req;
struct ocfs2_info_local_freeinode {
__u64 lfi_total;
__u64 lfi_free;
} ifi_stat[OCFS2_MAX_SLOTS];
__u32 ifi_slotnum; /* out */
__u32 ifi_pad;
};
#define OCFS2_INFO_MAX_HIST (32)
struct ocfs2_info_freefrag {
struct ocfs2_info_request iff_req;
struct ocfs2_info_freefrag_stats { /* (out) */
struct ocfs2_info_free_chunk_list {
__u32 fc_chunks[OCFS2_INFO_MAX_HIST];
__u32 fc_clusters[OCFS2_INFO_MAX_HIST];
} ffs_fc_hist;
__u32 ffs_clusters;
__u32 ffs_free_clusters;
__u32 ffs_free_chunks;
__u32 ffs_free_chunks_real;
__u32 ffs_min; /* Minimum free chunksize in clusters */
__u32 ffs_max;
__u32 ffs_avg;
__u32 ffs_pad;
} iff_ffs;
__u32 iff_chunksize; /* chunksize in clusters(in) */
__u32 iff_pad;
};
/* Codes for ocfs2_info_request */
enum ocfs2_info_type {
OCFS2_INFO_CLUSTERSIZE = 1,
@ -151,6 +183,8 @@ enum ocfs2_info_type {
OCFS2_INFO_UUID,
OCFS2_INFO_FS_FEATURES,
OCFS2_INFO_JOURNAL_SIZE,
OCFS2_INFO_FREEINODE,
OCFS2_INFO_FREEFRAG,
OCFS2_INFO_NUM_TYPES
};
@ -171,4 +205,38 @@ enum ocfs2_info_type {
#define OCFS2_IOC_INFO _IOR('o', 5, struct ocfs2_info)
struct ocfs2_move_extents {
/* All values are in bytes */
/* in */
__u64 me_start; /* Virtual start in the file to move */
__u64 me_len; /* Length of the extents to be moved */
__u64 me_goal; /* Physical offset of the goal,
it's in block unit */
__u64 me_threshold; /* Maximum distance from goal or threshold
for auto defragmentation */
__u64 me_flags; /* Flags for the operation:
* - auto defragmentation.
* - refcount,xattr cases.
*/
/* out */
__u64 me_moved_len; /* Moved/defraged length */
__u64 me_new_offset; /* Resulting physical location */
__u32 me_reserved[2]; /* Reserved for futhure */
};
#define OCFS2_MOVE_EXT_FL_AUTO_DEFRAG (0x00000001) /* Kernel manages to
claim new clusters
as the goal place
for extents moving */
#define OCFS2_MOVE_EXT_FL_PART_DEFRAG (0x00000002) /* Allow partial extent
moving, is to make
movement less likely
to fail, may make fs
even more fragmented */
#define OCFS2_MOVE_EXT_FL_COMPLETE (0x00000004) /* Move or defragmenation
completely gets done.
*/
#define OCFS2_IOC_MOVE_EXT _IOW('o', 6, struct ocfs2_move_extents)
#endif /* OCFS2_IOCTL_H */

View file

@ -688,6 +688,31 @@ TRACE_EVENT(ocfs2_cache_block_dealloc,
__entry->blkno, __entry->bit)
);
TRACE_EVENT(ocfs2_trim_extent,
TP_PROTO(struct super_block *sb, unsigned long long blk,
unsigned long long count),
TP_ARGS(sb, blk, count),
TP_STRUCT__entry(
__field(int, dev_major)
__field(int, dev_minor)
__field(unsigned long long, blk)
__field(__u64, count)
),
TP_fast_assign(
__entry->dev_major = MAJOR(sb->s_dev);
__entry->dev_minor = MINOR(sb->s_dev);
__entry->blk = blk;
__entry->count = count;
),
TP_printk("%d %d %llu %llu",
__entry->dev_major, __entry->dev_minor,
__entry->blk, __entry->count)
);
DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_trim_group);
DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_trim_fs);
/* End of trace events for fs/ocfs2/alloc.c. */
/* Trace events for fs/ocfs2/localalloc.c. */

View file

@ -66,7 +66,7 @@ struct ocfs2_cow_context {
u32 *num_clusters,
unsigned int *extent_flags);
int (*cow_duplicate_clusters)(handle_t *handle,
struct ocfs2_cow_context *context,
struct file *file,
u32 cpos, u32 old_cluster,
u32 new_cluster, u32 new_len);
};
@ -2921,20 +2921,21 @@ static int ocfs2_clear_cow_buffer(handle_t *handle, struct buffer_head *bh)
return 0;
}
static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
struct ocfs2_cow_context *context,
u32 cpos, u32 old_cluster,
u32 new_cluster, u32 new_len)
int ocfs2_duplicate_clusters_by_page(handle_t *handle,
struct file *file,
u32 cpos, u32 old_cluster,
u32 new_cluster, u32 new_len)
{
int ret = 0, partial;
struct ocfs2_caching_info *ci = context->data_et.et_ci;
struct inode *inode = file->f_path.dentry->d_inode;
struct ocfs2_caching_info *ci = INODE_CACHE(inode);
struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster);
struct page *page;
pgoff_t page_index;
unsigned int from, to, readahead_pages;
loff_t offset, end, map_end;
struct address_space *mapping = context->inode->i_mapping;
struct address_space *mapping = inode->i_mapping;
trace_ocfs2_duplicate_clusters_by_page(cpos, old_cluster,
new_cluster, new_len);
@ -2948,8 +2949,8 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
* We only duplicate pages until we reach the page contains i_size - 1.
* So trim 'end' to i_size.
*/
if (end > i_size_read(context->inode))
end = i_size_read(context->inode);
if (end > i_size_read(inode))
end = i_size_read(inode);
while (offset < end) {
page_index = offset >> PAGE_CACHE_SHIFT;
@ -2972,10 +2973,9 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
if (PAGE_CACHE_SIZE <= OCFS2_SB(sb)->s_clustersize)
BUG_ON(PageDirty(page));
if (PageReadahead(page) && context->file) {
if (PageReadahead(page)) {
page_cache_async_readahead(mapping,
&context->file->f_ra,
context->file,
&file->f_ra, file,
page, page_index,
readahead_pages);
}
@ -2999,8 +2999,7 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
}
}
ocfs2_map_and_dirty_page(context->inode,
handle, from, to,
ocfs2_map_and_dirty_page(inode, handle, from, to,
page, 0, &new_block);
mark_page_accessed(page);
unlock:
@ -3015,14 +3014,15 @@ unlock:
return ret;
}
static int ocfs2_duplicate_clusters_by_jbd(handle_t *handle,
struct ocfs2_cow_context *context,
u32 cpos, u32 old_cluster,
u32 new_cluster, u32 new_len)
int ocfs2_duplicate_clusters_by_jbd(handle_t *handle,
struct file *file,
u32 cpos, u32 old_cluster,
u32 new_cluster, u32 new_len)
{
int ret = 0;
struct super_block *sb = context->inode->i_sb;
struct ocfs2_caching_info *ci = context->data_et.et_ci;
struct inode *inode = file->f_path.dentry->d_inode;
struct super_block *sb = inode->i_sb;
struct ocfs2_caching_info *ci = INODE_CACHE(inode);
int i, blocks = ocfs2_clusters_to_blocks(sb, new_len);
u64 old_block = ocfs2_clusters_to_blocks(sb, old_cluster);
u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster);
@ -3145,8 +3145,8 @@ static int ocfs2_replace_clusters(handle_t *handle,
/*If the old clusters is unwritten, no need to duplicate. */
if (!(ext_flags & OCFS2_EXT_UNWRITTEN)) {
ret = context->cow_duplicate_clusters(handle, context, cpos,
old, new, len);
ret = context->cow_duplicate_clusters(handle, context->file,
cpos, old, new, len);
if (ret) {
mlog_errno(ret);
goto out;
@ -3162,22 +3162,22 @@ out:
return ret;
}
static int ocfs2_cow_sync_writeback(struct super_block *sb,
struct ocfs2_cow_context *context,
u32 cpos, u32 num_clusters)
int ocfs2_cow_sync_writeback(struct super_block *sb,
struct inode *inode,
u32 cpos, u32 num_clusters)
{
int ret = 0;
loff_t offset, end, map_end;
pgoff_t page_index;
struct page *page;
if (ocfs2_should_order_data(context->inode))
if (ocfs2_should_order_data(inode))
return 0;
offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits;
end = offset + (num_clusters << OCFS2_SB(sb)->s_clustersize_bits);
ret = filemap_fdatawrite_range(context->inode->i_mapping,
ret = filemap_fdatawrite_range(inode->i_mapping,
offset, end - 1);
if (ret < 0) {
mlog_errno(ret);
@ -3190,7 +3190,7 @@ static int ocfs2_cow_sync_writeback(struct super_block *sb,
if (map_end > end)
map_end = end;
page = find_or_create_page(context->inode->i_mapping,
page = find_or_create_page(inode->i_mapping,
page_index, GFP_NOFS);
BUG_ON(!page);
@ -3349,7 +3349,7 @@ static int ocfs2_make_clusters_writable(struct super_block *sb,
* in write-back mode.
*/
if (context->get_clusters == ocfs2_di_get_clusters) {
ret = ocfs2_cow_sync_writeback(sb, context, cpos,
ret = ocfs2_cow_sync_writeback(sb, context->inode, cpos,
orig_num_clusters);
if (ret)
mlog_errno(ret);

View file

@ -84,6 +84,17 @@ int ocfs2_refcount_cow_xattr(struct inode *inode,
struct buffer_head *ref_root_bh,
u32 cpos, u32 write_len,
struct ocfs2_post_refcount *post);
int ocfs2_duplicate_clusters_by_page(handle_t *handle,
struct file *file,
u32 cpos, u32 old_cluster,
u32 new_cluster, u32 new_len);
int ocfs2_duplicate_clusters_by_jbd(handle_t *handle,
struct file *file,
u32 cpos, u32 old_cluster,
u32 new_cluster, u32 new_len);
int ocfs2_cow_sync_writeback(struct super_block *sb,
struct inode *inode,
u32 cpos, u32 num_clusters);
int ocfs2_add_refcount_flag(struct inode *inode,
struct ocfs2_extent_tree *data_et,
struct ocfs2_caching_info *ref_ci,

View file

@ -1567,7 +1567,7 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
if (osb->preferred_slot != OCFS2_INVALID_SLOT)
seq_printf(s, ",preferred_slot=%d", osb->preferred_slot);
if (osb->s_atime_quantum != OCFS2_DEFAULT_ATIME_QUANTUM)
if (!(mnt->mnt_flags & MNT_NOATIME) && !(mnt->mnt_flags & MNT_RELATIME))
seq_printf(s, ",atime_quantum=%u", osb->s_atime_quantum);
if (osb->osb_commit_interval)