Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor select command #617

Merged
merged 9 commits into from
Jun 14, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ xml-rs = "0.8"
pica-matcher = { version = "0.1", path = "pica-matcher" }
pica-path = { version = "0.2", path = "pica-path" }
pica-record = { version = "0.1", path = "pica-record" }
pica-select = { version = "0.1", path = "pica-select" }

[dev-dependencies]
assert_cmd = "2.0"
Expand Down
171 changes: 85 additions & 86 deletions src/bin/pica/commands/select.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,19 @@ use std::collections::BTreeSet;
use std::ffi::OsString;
use std::fs::OpenOptions;
use std::hash::{Hash, Hasher};
use std::io::{self, Read, Write};
use std::io::{self, Write};
use std::str::FromStr;

use clap::Parser;
use pica::matcher::{MatcherFlags, RecordMatcher};
use pica::{Outcome, Reader, ReaderBuilder, Selectors};
use pica_matcher::{MatcherOptions, RecordMatcher};
use pica_record::io::{ReaderBuilder, RecordsIterator};
use pica_select::{Query, QueryExt};
use serde::{Deserialize, Serialize};

use crate::config::Config;
use crate::skip_invalid_flag;
use crate::translit::{translit_maybe, translit_maybe2};
use crate::util::{CliError, CliResult};
use crate::util::CliResult;

#[derive(Debug, Deserialize, Serialize)]
#[serde(rename_all = "kebab-case")]
Expand Down Expand Up @@ -69,8 +71,9 @@ pub(crate) struct Select {
#[arg(short, long, value_name = "filename")]
output: Option<OsString>,

/// Comma-separated list of selectors
selectors: String,
/// Query (comma-separated list of path expressions or string
/// literals)
query: String,

/// Read one or more files in normalized PICA+ format.
#[arg(default_value = "-", hide_default_value = true)]
Expand Down Expand Up @@ -103,101 +106,97 @@ impl Select {
);

let mut seen = BTreeSet::new();
let mut writer = csv::WriterBuilder::new()
.delimiter(if self.tsv { b'\t' } else { b',' })
.from_writer(writer(self.output, self.append)?);

let selectors = if let Some(ref global) = config.global {
translit_maybe2(&self.selectors, global.translit)
let options =
MatcherOptions::default().case_ignore(self.ignore_case);

let matcher = if let Some(matcher_str) = self.filter {
if let Some(ref global) = config.global {
Some(RecordMatcher::new(&translit_maybe2(
&matcher_str,
global.translit,
))?)
} else {
Some(RecordMatcher::new(&matcher_str)?)
}
} else {
self.selectors.to_string()
None
};

let selectors = match Selectors::decode(&selectors) {
Ok(val) => val,
_ => {
return Err(CliError::Other(format!(
"invalid select list: {}",
self.selectors
)))
}
let query = if let Some(ref global) = config.global {
Query::from_str(&translit_maybe2(
&self.query,
global.translit,
))?
} else {
Query::from_str(&self.query)?
};

let mut writer = csv::WriterBuilder::new()
.delimiter(if self.tsv { b'\t' } else { b',' })
.from_writer(writer(self.output, self.append)?);

if let Some(header) = self.header {
writer.write_record(header.split(',').map(|s| s.trim()))?;
}

let flags = MatcherFlags::default();
let filter = match self.filter {
Some(filter_str) => match RecordMatcher::new(&filter_str) {
Ok(f) => f,
_ => {
return Err(CliError::Other(format!(
"invalid filter: \"{filter_str}\""
)))
}
},
None => RecordMatcher::True,
};

for filename in self.filenames {
let builder =
ReaderBuilder::new().skip_invalid(skip_invalid);
let mut reader: Reader<Box<dyn Read>> = match filename
.to_str()
{
Some("-") => builder.from_reader(Box::new(io::stdin())),
_ => builder.from_path(filename)?,
};

for result in reader.records() {
let record = result?;

if !filter.is_match(&record, &flags) {
continue;
}

let outcome = selectors
.iter()
.map(|selector| {
record.select(selector, self.ignore_case)
})
.fold(Outcome::default(), |acc, x| acc * x);

for row in outcome.iter() {
if self.no_empty_columns
&& row.iter().any(|column| column.is_empty())
{
continue;
}

if self.unique {
let mut hasher = DefaultHasher::new();
row.hash(&mut hasher);
let hash = hasher.finish();
let mut reader =
ReaderBuilder::new().from_path(filename)?;

if seen.contains(&hash) {
while let Some(result) = reader.next() {
match result {
Err(e) => {
if e.is_invalid_record() && skip_invalid {
continue;
} else {
return Err(e.into());
}

seen.insert(hash);
}
Ok(record) => {
if let Some(ref matcher) = matcher {
if !matcher.is_match(&record, &options) {
continue;
}
}

if !row.iter().all(|col| col.is_empty()) {
if self.translit.is_some() {
writer.write_record(
row.iter()
.map(ToString::to_string)
.map(|s| {
translit_maybe(
&s,
self.translit.as_deref(),
)
}),
)?;
} else {
writer.write_record(row)?;
};
let outcome = record.query(&query, &options);
for row in outcome.iter() {
if self.no_empty_columns
&& row
.iter()
.any(|column| column.is_empty())
{
continue;
}

if self.unique {
let mut hasher = DefaultHasher::new();
row.hash(&mut hasher);
let hash = hasher.finish();

if seen.contains(&hash) {
continue;
}

seen.insert(hash);
}

if !row.iter().all(|col| col.is_empty()) {
if self.translit.is_some() {
writer.write_record(
row.iter().map(|s| {
translit_maybe(
s,
self.translit
.as_deref(),
)
}),
)?;
} else {
writer.write_record(row)?;
};
}
}
}
}
}
Expand Down
4 changes: 4 additions & 0 deletions src/bin/pica/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,10 @@ fn main() {
eprintln!("Parse Matcher Error: {err}");
process::exit(1);
}
Err(CliError::ParseQuery(err)) => {
eprintln!("Parse Query Error: {err}");
process::exit(1);
}
Err(CliError::Pica(err)) => {
eprintln!("Pica Error: {err}");
process::exit(1);
Expand Down
8 changes: 8 additions & 0 deletions src/bin/pica/util.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ pub(crate) enum CliError {
ParsePica(pica_record::ParsePicaError),
ParsePath(pica_path::ParsePathError),
ParseMatcher(pica_matcher::ParseMatcherError),
ParseQuery(pica_select::ParseQueryError),
Other(String),
}

Expand All @@ -25,6 +26,7 @@ impl fmt::Display for CliError {
CliError::ParsePica(ref e) => e.fmt(f),
CliError::ParsePath(ref e) => e.fmt(f),
CliError::ParseMatcher(ref e) => e.fmt(f),
CliError::ParseQuery(ref e) => e.fmt(f),
CliError::Other(ref s) => f.write_str(s),
}
}
Expand Down Expand Up @@ -76,3 +78,9 @@ impl From<pica_matcher::ParseMatcherError> for CliError {
CliError::ParseMatcher(err)
}
}

impl From<pica_select::ParseQueryError> for CliError {
fn from(err: pica_select::ParseQueryError) -> Self {
CliError::ParseQuery(err)
}
}
2 changes: 0 additions & 2 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ pub use self::parser::{ParsePathError, ParsePicaError};
pub use self::path::Path;
pub use self::reader::{Reader, ReaderBuilder};
pub use self::record::{ByteRecord, StringRecord};
pub use self::select::{Outcome, Selector, Selectors};
pub use self::subfield::Subfield;
pub use self::tag::{Level, Tag};
pub use self::writer::{
Expand All @@ -27,7 +26,6 @@ mod parser;
mod path;
mod reader;
mod record;
mod select;
mod subfield;
mod tag;
#[cfg(test)]
Expand Down
78 changes: 1 addition & 77 deletions src/record.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,8 @@ use bstr::BString;
use serde::ser::{Serialize, SerializeStruct, Serializer};

use crate::error::Result;
use crate::matcher::{MatcherFlags, OccurrenceMatcher, TagMatcher};
use crate::matcher::{OccurrenceMatcher, TagMatcher};
use crate::parser::{parse_fields, ParsePicaError};
use crate::select::{Outcome, Selector};
use crate::{Field, Path};

/// A PICA+ record that may contain invalid UTF-8 data.
Expand Down Expand Up @@ -272,81 +271,6 @@ impl ByteRecord {
.collect()
}

/// Evaluates a single selector against this record and returns the
/// resulting `Outcome`.
///
/// * `Selector::Value` yields an outcome built from that one literal value.
/// * `Selector::Field` keeps the fields whose tag and occurrence match the
///   selector and which pass the selector's optional filter. For each such
///   field, the outcomes of the requested subfield codes are multiplied
///   together; the per-field outcomes are then added up.
///
/// If the combined outcome is empty, an outcome with one empty string per
/// requested subfield code is returned instead.
///
/// `ignore_case` is forwarded to the selector's filter via `MatcherFlags`.
pub fn select(
    &self,
    selector: &Selector,
    ignore_case: bool,
) -> Outcome {
    let field_selector = match selector {
        Selector::Value(literal) => {
            return Outcome::from_values(vec![BString::from(
                literal.as_bytes(),
            )]);
        }
        Selector::Field(field_selector) => field_selector,
    };

    // The flags do not depend on the field, so build them once up front.
    let flags = MatcherFlags {
        ignore_case,
        strsim_threshold: 0.0,
    };

    let mut combined = Outcome::default();

    for field in self.iter() {
        // Same checks, in the same order, as the original filter chain.
        let selected = field_selector.tag.is_match(field.tag())
            && field_selector.occurrence.is_match(field.occurrence())
            && field_selector
                .filter
                .as_ref()
                .map_or(true, |filter| filter.is_match(field, &flags));

        if !selected {
            continue;
        }

        let mut per_field = Outcome::default();

        for code in field_selector.subfields.iter() {
            let rows: Vec<Vec<BString>> = field
                .subfields
                .iter()
                .filter(|subfield| subfield.code == *code)
                .map(|subfield| vec![subfield.value().to_owned()])
                .collect();

            // A code with no matching subfield contributes `Outcome::one()`.
            let column = if rows.is_empty() {
                Outcome::one()
            } else {
                Outcome(rows)
            };

            per_field = per_field * column;
        }

        combined = combined + per_field;
    }

    if !combined.is_empty() {
        return combined;
    }

    // Nothing was selected: emit one empty value per requested subfield code.
    let placeholders: Vec<BString> = (0..field_selector.subfields.len())
        .map(|_| BString::from(""))
        .collect();

    Outcome::from_values(placeholders)
}

/// Reduce the record to the given fields.
///
/// # Example
Expand Down
Loading