Graphite/node-graph/nodes/text/src/regex.rs

231 lines
7.9 KiB
Rust

use core_types::registry::types::SignedInteger;
use core_types::table::{Table, TableRow};
use core_types::{ATTR_END, ATTR_NAME, ATTR_START, Ctx};
/// Checks whether the string contains a match for the given regular expression pattern. Optionally restricts the match to only the start and/or end of the string.
#[node_macro::node(category("Text: Regex"))]
fn regex_contains(
_: impl Ctx,
/// The string to search within.
string: String,
/// The regular expression pattern to search for.
pattern: String,
/// Match letters regardless of case.
case_insensitive: bool,
/// Make `^` and `$` match the start and end of each line, not just the whole string.
multiline: bool,
/// Only match if the pattern appears at the start of the string.
at_start: bool,
/// Only match if the pattern appears at the end of the string.
at_end: bool,
) -> bool {
let flags = match (case_insensitive, multiline) {
(false, false) => "",
(true, false) => "(?i)",
(false, true) => "(?m)",
(true, true) => "(?im)",
};
let anchored_pattern = match (at_start, at_end) {
(true, true) => format!("{flags}\\A(?:{pattern})\\z"),
(true, false) => format!("{flags}\\A(?:{pattern})"),
(false, true) => format!("{flags}(?:{pattern})\\z"),
(false, false) => format!("{flags}{pattern}"),
};
let Ok(regex) = fancy_regex::Regex::new(&anchored_pattern) else {
log::error!("Invalid regex pattern: {pattern}");
return false;
};
regex.is_match(&string).unwrap_or(false)
}
/// Replaces matches of a regular expression pattern in the string. The replacement string can reference captures: `$0` for the whole match and `$1`, `$2`, etc. for capture groups.
#[node_macro::node(category("Text: Regex"))]
fn regex_replace(
_: impl Ctx,
string: String,
/// The regular expression pattern to search for.
pattern: String,
/// The replacement string. Use `$0` for the whole match and `$1`, `$2`, etc. for capture groups.
replacement: String,
/// Replace all matches. When disabled, only the first match is replaced.
#[default(true)]
replace_all: bool,
/// Match letters regardless of case.
case_insensitive: bool,
/// Make `^` and `$` match the start and end of each line, not just the whole string.
multiline: bool,
) -> String {
let flags = match (case_insensitive, multiline) {
(false, false) => "",
(true, false) => "(?i)",
(false, true) => "(?m)",
(true, true) => "(?im)",
};
let full_pattern = format!("{flags}{pattern}");
let Ok(regex) = fancy_regex::Regex::new(&full_pattern) else {
log::warn!("Invalid regex pattern: {pattern}");
return string;
};
if replace_all {
regex.replace_all(&string, replacement.as_str()).into_owned()
} else {
regex.replace(&string, replacement.as_str()).into_owned()
}
}
/// Finds a regex match in the string and returns its components. The result is a list where the first item is the whole match (`$0`) and subsequent items are the capture groups (`$1`, `$2`, etc., if any).
///
/// The match index selects which non-overlapping occurrence to return (0 for the first match). Returns an empty list if no match is found at the given index.
///
/// Each item carries `start` and `end` byte-offset attributes pointing into the original string, plus a `name` attribute holding
/// the capture group's name (empty for unnamed groups, and for index 0 which is the whole match).
#[node_macro::node(category(""))]
fn regex_find(
_: impl Ctx,
/// The string to search within.
string: String,
/// The regular expression pattern to search for.
pattern: String,
/// Which non-overlapping occurrence of the pattern to return, starting from 0 for the first match. Negative indices count backwards from the last match.
match_index: SignedInteger,
/// Match letters regardless of case.
case_insensitive: bool,
/// Make `^` and `$` match the start and end of each line, not just the whole string.
multiline: bool,
) -> Table<String> {
if pattern.is_empty() {
return Table::new();
}
let flags = match (case_insensitive, multiline) {
(false, false) => "",
(true, false) => "(?i)",
(false, true) => "(?m)",
(true, true) => "(?im)",
};
let full_pattern = format!("{flags}{pattern}");
let Ok(regex) = fancy_regex::Regex::new(&full_pattern) else {
log::error!("Invalid regex pattern: {pattern}");
return Table::new();
};
// Capture group names indexed positionally; index 0 (the whole match) is always None.
let capture_names: Vec<Option<String>> = regex.capture_names().map(|name| name.map(str::to_string)).collect();
// Collect all matches since we need to support negative indexing
let matches: Vec<_> = regex.captures_iter(&string).filter_map(|c| c.ok()).collect();
let match_index = match_index as i32;
let resolved_index = if match_index < 0 {
let from_end = (-match_index) as usize;
if from_end > matches.len() {
return Table::new();
}
matches.len() - from_end
} else {
match_index as usize
};
let Some(captures) = matches.get(resolved_index) else {
return Table::new();
};
// Index 0 is the whole match, 1+ are capture groups
(0..captures.len())
.map(|i| {
let captured = captures.get(i);
let text = captured.map_or(String::new(), |m| m.as_str().to_string());
let start = captured.map_or(0_u64, |m| m.start() as u64);
let end = captured.map_or(0_u64, |m| m.end() as u64);
let name = capture_names.get(i).cloned().flatten().unwrap_or_default();
TableRow::new_from_element(text)
.with_attribute(ATTR_START, start)
.with_attribute(ATTR_END, end)
.with_attribute(ATTR_NAME, name)
})
.collect()
}
/// Finds all non-overlapping matches of a regular expression pattern in the string, returning a list of the matched substrings.
///
/// Each item carries `start` and `end` byte-offset attributes pointing into the original string.
#[node_macro::node(category("Text: Regex"))]
fn regex_find_all(
_: impl Ctx,
/// The string to search within.
string: String,
/// The regular expression pattern to search for.
pattern: String,
/// Match letters regardless of case.
case_insensitive: bool,
/// Make `^` and `$` match the start and end of each line, not just the whole string.
multiline: bool,
) -> Table<String> {
if pattern.is_empty() {
return Table::new();
}
let flags = match (case_insensitive, multiline) {
(false, false) => "",
(true, false) => "(?i)",
(false, true) => "(?m)",
(true, true) => "(?im)",
};
let full_pattern = format!("{flags}{pattern}");
let Ok(regex) = fancy_regex::Regex::new(&full_pattern) else {
log::error!("Invalid regex pattern: {pattern}");
return Table::new();
};
regex
.find_iter(&string)
.filter_map(|m| m.ok())
.map(|m| {
TableRow::new_from_element(m.as_str().to_string())
.with_attribute(ATTR_START, m.start() as u64)
.with_attribute(ATTR_END, m.end() as u64)
})
.collect()
}
/// Splits a string into a list of substrings pulled from between separator characters as matched by a regular expression.
///
/// For example, splitting "Three, two, one... LIFTOFF" with pattern `\W+` (non-word characters) produces `["Three", "two", "one", "LIFTOFF"]`.
#[node_macro::node(category("Text: Regex"))]
fn regex_split(
_: impl Ctx,
/// The string to split into substrings.
string: String,
/// The regular expression pattern to split on. Matches are consumed and not included in the output.
pattern: String,
/// Match letters regardless of case.
case_insensitive: bool,
/// Make `^` and `$` match the start and end of each line, not just the whole string.
multiline: bool,
) -> Table<String> {
if pattern.is_empty() {
return Table::new_from_element(string);
}
let flags = match (case_insensitive, multiline) {
(false, false) => "",
(true, false) => "(?i)",
(false, true) => "(?m)",
(true, true) => "(?im)",
};
let full_pattern = format!("{flags}{pattern}");
let Ok(regex) = fancy_regex::Regex::new(&full_pattern) else {
log::error!("Invalid regex pattern: {pattern}");
return Table::new_from_element(string);
};
regex.split(&string).filter_map(|s| s.ok()).map(|s| s.to_string()).map(TableRow::new_from_element).collect()
}