Accelerated get that doesn't traverse sub-indices; roadmap update

This commit is contained in:
Josh Hansen 2023-08-30 16:17:10 -07:00
parent f4403379bb
commit 0f605d1fe8
7 changed files with 202 additions and 91 deletions

View file

@ -31,14 +31,16 @@
-[x] `ls` to show Ghee's view of the world
-[x] `init` to initialize a directory as a full-fledged Ghee table (with specified key names)
-[x] Sensible default destination for `idx`
-[ ] Make `get` stop listing nested indices
-[ ] Make `get` accelerate traversal using the best available index (a la `del`)
-[x] Make `get` stop listing nested indices
-[x] Make `get` accelerate traversal using the best available index (a la `del`)
-[ ] Make `get` return paths from original index rather than the one used to accelerate
-[x] Fix output order, making commands reproducible
## 0.5
-[ ] `create` to `init` and `ins` simultaneously
-[ ] Integrate with Btrfs snapshots?
-[ ] Integrate with Btrfs / Bcachefs / ZFS snapshots?
-[ ] Unit test remaining commands
-[ ] Existence predicates, e.g. `-w name` matches any record for which `name` is set
## Future

View file

@ -22,9 +22,7 @@ Linked ./people/:state:id/NM/4 -> ./people/Lilly
Initialized ./people/Darrel
Linked ./people:id/5 -> ./people/Darrel
Linked ./people/:state:id/MI/5 -> ./people/Darrel
+ ghee get -w state=CA -f name ./people
./people/Sandeep user.name Sandeep
./people/Wulfrum user.name Wulfrum
+ ghee get -a -w state=CA -f name ./people
./people/:state:id/CA/0 user.name Wulfrum
./people/:state:id/CA/2 user.name Sandeep
+ ghee del -v ./people -w name=Sofia
@ -37,6 +35,6 @@ Removed ./people/Janella
+ ghee del -v ./people/:state:id CA 0
Removed ./people/Wulfrum
Removed ./people/:state:id/CA/0
+ ghee get -w state=CA -f name ./people/:state:id
+ ghee get -a -w state=CA -f name ./people/:state:id
./people/:state:id/CA/2 user.name Sandeep
+ cd ..

View file

@ -78,6 +78,8 @@ enum Commands {
where_: Vec<Predicate>,
#[arg(long, help = "Process paths nonrecursively; defaults to false")]
flat: bool,
#[arg(short, long, help = "Include user.ghee prefix in output")]
all: bool,
},
/// Set xattr values
@ -244,7 +246,10 @@ fn run_command(cmd: &Commands) {
json,
where_,
flat,
} => get(paths, fields, *json, where_, !*flat),
all,
} => get(paths, fields, *json, where_, !*flat, *all).unwrap_or_else(|e| {
panic!("Error getting record(s): {}", e);
}),
Commands::Set {
paths,
field_assignments,

View file

@ -81,7 +81,10 @@ pub fn del(
debug_assert!(!where_.is_empty());
walk(
PathOrIndices::Indices(&all_indices),
PathOrIndices::PathAndIndices {
path: table_path,
indices: &all_indices,
},
where_,
true,
false,

View file

@ -1,9 +1,9 @@
use anyhow::Result;
use serde::Serialize;
use walkdir::WalkDir;
use crate::{
parser::{predicate::Predicate, value::Value, xattr::Xattr},
xattr_values,
walk, PathOrIndices,
};
use std::{collections::BTreeMap, io::Write, path::PathBuf};
@ -20,69 +20,66 @@ pub fn get(
json: bool,
where_: &Vec<Predicate>,
recursive: bool,
) {
let max_depth = if recursive { usize::MAX } else { 0 };
let paths = paths
.iter()
.flat_map(|p| WalkDir::new(p).max_depth(max_depth))
.map(|e| e.unwrap().into_path());
'outer: for path in paths {
let all_field_values = xattr_values(&path).unwrap();
for where_clause in where_ {
if !where_clause.satisfied(&all_field_values) {
continue 'outer;
}
}
// Fields that will be output
let projected_fields: Vec<Xattr> = if fields.is_empty() {
all_field_values.keys().cloned().collect()
} else {
fields.clone()
};
if json {
let mut xattrs: BTreeMap<&Xattr, &Value> = BTreeMap::new();
for field in projected_fields.iter() {
let value = &all_field_values[field];
xattrs.insert(field, value);
}
if !xattrs.is_empty() {
let file_xattrs = FileXattrs {
path: path.to_string_lossy().to_string(),
xattrs,
all: bool,
) -> Result<()> {
for path in paths {
walk(
PathOrIndices::Path(path),
where_,
recursive,
all,
&|record| {
// Fields that will be output
let projected_fields: Vec<Xattr> = if fields.is_empty() {
record.xattr_values.keys().cloned().collect()
} else {
fields.clone()
};
if json {
let mut xattrs: BTreeMap<&Xattr, &Value> = BTreeMap::new();
println!(
"{}",
serde_json::to_string(&file_xattrs)
.unwrap_or_else(|e| panic!("Could not serialize as JSON: {}", e))
);
}
} else {
for field in projected_fields.iter() {
if let Some(value) = all_field_values.get(field) {
print!("{}\t{}\t", path.display(), field);
for field in projected_fields.iter() {
let value = &record.xattr_values[field];
{
let mut stdout = std::io::stdout();
stdout
.write(value.as_bytes().as_slice())
.unwrap_or_else(|e| {
panic!(
"Could not write xattr {} value {:?} to stdout: {}",
field, value, e
)
});
xattrs.insert(field, value);
}
if !xattrs.is_empty() {
let file_xattrs = FileXattrs {
path: record.path.to_string_lossy().to_string(),
xattrs,
};
println!(
"{}",
serde_json::to_string(&file_xattrs)
.unwrap_or_else(|e| panic!("Could not serialize as JSON: {}", e))
);
}
} else {
for field in projected_fields.iter() {
if let Some(value) = record.xattr_values.get(field) {
print!("{}\t{}\t", record.path.display(), field);
{
let mut stdout = std::io::stdout();
stdout
.write(value.as_bytes().as_slice())
.unwrap_or_else(|e| {
panic!(
"Could not write xattr {} value {:?} to stdout: {}",
field, value, e
)
});
}
println!();
}
}
println!();
}
}
}
Ok(())
},
)?;
}
Ok(())
}

View file

@ -116,7 +116,7 @@ mod test {
use crate::{
cmd::init,
get_index_info, get_key,
get_index_info, get_key, indices,
parser::{index::IndexInfo, key::Key},
};
@ -128,10 +128,14 @@ mod test {
let dir2 = TempDir::new("ghee-test-idx:2").unwrap().into_path();
let dir3 = TempDir::new("ghee-test-idx:3").unwrap().into_path();
let key1 = Key::from(vec!["test1"]);
let key2 = Key::from(vec!["test2"]);
let key3 = Key::from_string("test3");
init(&dir1, &key1, false).unwrap();
idx(&dir1, Some(&dir2), &key2, false);
@ -157,6 +161,17 @@ mod test {
);
let idx_key = get_key(&dir2).unwrap();
assert_eq!(idx_key, Some(key2));
assert_eq!(idx_key.as_ref(), Some(&key2));
idx(&dir1, Some(&dir3), &key3, false);
// Make sure the indices are updated properly after a second index
// (no overwriting of the previous)
let indices = indices(&dir1).unwrap();
assert_eq!(indices.len(), 3);
assert!(indices.contains_key(&key1));
assert!(indices.contains_key(&key2));
assert!(indices.contains_key(&key3));
}
}

View file

@ -195,31 +195,60 @@ pub fn indices(path: &PathBuf) -> Result<BTreeMap<Key, PathBuf>> {
/// Get the index that places the predicate xattrs earliest in its primary key order.
/// The idea is that this will maximally speed up traversal of records, though the actual
/// benefit may depend on the cardinality / distribution of the subkey values.
pub fn best_index<'a>(
pub fn best_index<'a, 'b>(
indices: &'a BTreeMap<Key, PathBuf>,
where_: &Vec<Predicate>,
tie_breaker_key: &'a Key,
) -> (&'a Key, &'a PathBuf) {
let predicate_xattrs: Vec<Xattr> = where_.iter().map(|pred| pred.xattr.clone()).collect();
indices
.iter()
.min_by_key(|(key, _path)| {
predicate_xattrs
let earliest_subkey_indices: Vec<Option<usize>> = indices
.keys()
.map(|key| {
let x: Option<usize> = predicate_xattrs
.iter()
.map(|xattr| {
key.subkeys
.iter()
.position(|subkey| *subkey == *xattr)
.unwrap_or(UNINDEXED_PREDICATE_PENALTY)
.map(|xattr| key.subkeys.iter().position(|subkey| *subkey == *xattr))
.reduce(|a, b| {
if let Some(a) = a {
if let Some(b) = b {
Some(a + b)
} else {
Some(a)
}
} else {
if let Some(b) = b {
Some(b)
} else {
None
}
}
})
.reduce(|a, b| a + b)
.unwrap_or(None);
x
})
.unwrap()
.collect();
if earliest_subkey_indices.iter().all(|idx| idx.is_none()) {
let path = &indices[tie_breaker_key];
(tie_breaker_key, path)
} else {
indices
.iter()
.enumerate()
.min_by_key(|(idx, (_key, _path))| {
earliest_subkey_indices[*idx].unwrap_or(UNINDEXED_PREDICATE_PENALTY)
})
.unwrap()
.1
}
}
/// Describes what `walk` should traverse: a path, an index map, or both together.
///
/// NOTE(review): this listing appears to interleave pre- and post-change lines; `Indices`
/// looks superseded by `PathAndIndices` — confirm which variants exist in the final file.
pub enum PathOrIndices<'a, 'b> {
/// A path only; `walk` pairs it with its own `loaded_indices`.
Path(&'a PathBuf),
/// An index map (key to path) supplied without an explicit starting path.
Indices(&'b BTreeMap<Key, PathBuf>),
/// A starting path together with the index map to choose from.
PathAndIndices {
/// Path whose entry in `indices` supplies the tie-breaker key passed to `best_index`.
path: &'a PathBuf,
/// Index map from key to the path of the index using that key.
indices: &'b BTreeMap<Key, PathBuf>,
},
}
/**
@ -252,12 +281,14 @@ pub fn walk<F: Fn(PathVisit) -> Result<()>>(
None
};
let indices = match path_or_table_indices {
PathOrIndices::Path(_path) => loaded_indices.as_ref().unwrap(),
PathOrIndices::Indices(indices) => indices,
let (path, indices) = match path_or_table_indices {
PathOrIndices::Path(path) => (path, loaded_indices.as_ref().unwrap()),
PathOrIndices::PathAndIndices { path, indices } => (path, indices),
};
let (key, path) = best_index(indices, where_);
let key = indices.iter().find(|(_key, p)| *p == path).unwrap().0;
let (key, path) = best_index(indices, where_, key);
let path_len = path.components().count();
@ -272,6 +303,11 @@ pub fn walk<F: Fn(PathVisit) -> Result<()>>(
return true;
}
// Always ignore nested indices
if is_hidden(e) {
return false;
}
let values = xattr_values_from_path(&key, &path, &path).unwrap();
// Of all xattr values set on this path, if a relevant predicate is contradicted, proceed no further
@ -485,11 +521,13 @@ mod test {
use tempdir::TempDir;
use crate::{
cmd::{init, set},
get_index_info, get_key, index_list_push,
best_index,
cmd::{idx, init, set},
get_index_info, get_key, index_list_push, indices,
parser::{
assignment::parse_assignment,
key::Key,
predicate::parse_predicate,
xattr::{parse_xattr, Namespace, Xattr},
},
set_key, walk, PathOrIndices,
@ -623,4 +661,57 @@ mod test {
assert!(visited_recursive.contains(&dir1));
assert!(visited_recursive.contains(&dir2));
}
/// Exercises `best_index` on a table keyed by `test` that also has an index keyed by
/// `blah,test`, checking the tie-breaker fallback and both predicate-driven choices.
#[test]
fn test_best_index() {
    // The table itself, whose key has the single subkey `test`
    let table_dir = TempDir::new("ghee-test-best-index-dir1")
        .unwrap()
        .into_path();
    let table_key = Key::from_string("test");
    init(&table_dir, &table_key, false).unwrap();

    // An additional index of the table that puts `blah` ahead of `test`
    let index_dir = TempDir::new("ghee-test-best-index-dir2")
        .unwrap()
        .into_path();
    let index_key = Key::from_string("blah,test");
    idx(&table_dir, Some(&index_dir), &index_key, false);

    let known_indices = indices(&table_dir).unwrap();

    // Without any predicate nothing can be ranked, so the tie breaker is returned
    let no_predicates = Vec::new();
    assert_eq!(
        best_index(&known_indices, &no_predicates, &table_key),
        (&table_key, &table_dir)
    );

    // `test` is the leading subkey of the table's own key, so the table wins
    let on_test = vec![parse_predicate(b"test=5").unwrap().1];
    assert_eq!(
        best_index(&known_indices, &on_test, &table_key),
        (&table_key, &table_dir)
    );

    // `blah` only appears in the second index's key, so that index wins
    let on_blah = vec![parse_predicate(b"blah=6").unwrap().1];
    assert_eq!(
        best_index(&known_indices, &on_blah, &table_key),
        (&index_key, &index_dir)
    );
}
}