1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181
use std::{ffi::OsStr, io, path::Path};
use bstr::io::BufReadExt;
use crate::escape::{escape, escape_os};
/// An error that occurs when a pattern could not be converted to valid UTF-8.
///
/// The purpose of this error is to give a more targeted failure mode for
/// patterns written by end users that are not valid UTF-8.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct InvalidPatternError {
original: String,
valid_up_to: usize,
}
impl InvalidPatternError {
/// Returns the index in the given string up to which valid UTF-8 was
/// verified.
pub fn valid_up_to(&self) -> usize {
self.valid_up_to
}
}
impl std::error::Error for InvalidPatternError {}
impl std::fmt::Display for InvalidPatternError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(
f,
"found invalid UTF-8 in pattern at byte offset {}: {} \
(disable Unicode mode and use hex escape sequences to match \
arbitrary bytes in a pattern, e.g., '(?-u)\\xFF')",
self.valid_up_to, self.original,
)
}
}
impl From<InvalidPatternError> for io::Error {
fn from(paterr: InvalidPatternError) -> io::Error {
io::Error::new(io::ErrorKind::Other, paterr)
}
}
/// Convert an OS string into a regular expression pattern.
///
/// This conversion fails if the given pattern is not valid UTF-8, in which
/// case, a targeted error with more information about where the invalid UTF-8
/// occurs is given. The error also suggests the use of hex escape sequences,
/// which are supported by many regex engines.
pub fn pattern_from_os(pattern: &OsStr) -> Result<&str, InvalidPatternError> {
pattern.to_str().ok_or_else(|| {
let valid_up_to = pattern
.to_string_lossy()
.find('\u{FFFD}')
.expect("a Unicode replacement codepoint for invalid UTF-8");
InvalidPatternError { original: escape_os(pattern), valid_up_to }
})
}
/// Convert arbitrary bytes into a regular expression pattern.
///
/// This conversion fails if the given pattern is not valid UTF-8, in which
/// case, a targeted error with more information about where the invalid UTF-8
/// occurs is given. The error also suggests the use of hex escape sequences,
/// which are supported by many regex engines.
pub fn pattern_from_bytes(
pattern: &[u8],
) -> Result<&str, InvalidPatternError> {
std::str::from_utf8(pattern).map_err(|err| InvalidPatternError {
original: escape(pattern),
valid_up_to: err.valid_up_to(),
})
}
/// Read patterns from a file path, one per line.
///
/// If there was a problem reading or if any of the patterns contain invalid
/// UTF-8, then an error is returned. If there was a problem with a specific
/// pattern, then the error message will include the line number and the file
/// path.
pub fn patterns_from_path<P: AsRef<Path>>(path: P) -> io::Result<Vec<String>> {
let path = path.as_ref();
let file = std::fs::File::open(path).map_err(|err| {
io::Error::new(
io::ErrorKind::Other,
format!("{}: {}", path.display(), err),
)
})?;
patterns_from_reader(file).map_err(|err| {
io::Error::new(
io::ErrorKind::Other,
format!("{}:{}", path.display(), err),
)
})
}
/// Read patterns from stdin, one per line.
///
/// If there was a problem reading or if any of the patterns contain invalid
/// UTF-8, then an error is returned. If there was a problem with a specific
/// pattern, then the error message will include the line number and the fact
/// that it came from stdin.
pub fn patterns_from_stdin() -> io::Result<Vec<String>> {
let stdin = io::stdin();
let locked = stdin.lock();
patterns_from_reader(locked).map_err(|err| {
io::Error::new(io::ErrorKind::Other, format!("<stdin>:{}", err))
})
}
/// Read patterns from any reader, one per line.
///
/// If there was a problem reading or if any of the patterns contain invalid
/// UTF-8, then an error is returned. If there was a problem with a specific
/// pattern, then the error message will include the line number.
///
/// Note that this routine uses its own internal buffer, so the caller should
/// not provide their own buffered reader if possible.
///
/// # Example
///
/// This shows how to parse patterns, one per line.
///
/// ```
/// use grep_cli::patterns_from_reader;
///
/// let patterns = "\
/// foo
/// bar\\s+foo
/// [a-z]{3}
/// ";
///
/// assert_eq!(patterns_from_reader(patterns.as_bytes())?, vec![
/// r"foo",
/// r"bar\s+foo",
/// r"[a-z]{3}",
/// ]);
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
pub fn patterns_from_reader<R: io::Read>(rdr: R) -> io::Result<Vec<String>> {
let mut patterns = vec![];
let mut line_number = 0;
io::BufReader::new(rdr).for_byte_line(|line| {
line_number += 1;
match pattern_from_bytes(line) {
Ok(pattern) => {
patterns.push(pattern.to_string());
Ok(true)
}
Err(err) => Err(io::Error::new(
io::ErrorKind::Other,
format!("{}: {}", line_number, err),
)),
}
})?;
Ok(patterns)
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn bytes() {
let pat = b"abc\xFFxyz";
let err = pattern_from_bytes(pat).unwrap_err();
assert_eq!(3, err.valid_up_to());
}
#[test]
#[cfg(unix)]
fn os() {
use std::ffi::OsStr;
use std::os::unix::ffi::OsStrExt;
let pat = OsStr::from_bytes(b"abc\xFFxyz");
let err = pattern_from_os(pat).unwrap_err();
assert_eq!(3, err.valid_up_to());
}
}