use crate::markdown::{MdStream, MdTree};
use std::{iter, mem, str};
const ANC_E: &[u8] = b">";
const ANC_S: &[u8] = b"<";
const BRK: &[u8] = b"---";
const CBK: &[u8] = b"```";
const CIL: &[u8] = b"`";
const CMT_E: &[u8] = b"-->";
const CMT_S: &[u8] = b"<!--";
const EMP: &[u8] = b"_";
const HDG: &[u8] = b"#";
const LNK_CHARS: &str = "$-_.+!*'()/&?=:%";
const LNK_E: &[u8] = b"]";
const LNK_S: &[u8] = b"[";
const STG: &[u8] = b"**";
const STK: &[u8] = b"~~";
const UL1: &[u8] = b"* ";
const UL2: &[u8] = b"- ";
const REPLACEMENTS: &[(&str, &str)] = &[
("(c)", "©"),
("(C)", "©"),
("(r)", "®"),
("(R)", "®"),
("(tm)", "™"),
("(TM)", "™"),
(":crab:", "🦀"),
("\n", " "),
];
type Parsed<'a> = (MdTree<'a>, &'a [u8]);
type ParseResult<'a> = Option<Parsed<'a>>;
#[derive(Clone, Copy, Debug, PartialEq)]
struct Context {
top_block: bool,
prev: Prev,
}
#[derive(Clone, Copy, Debug, PartialEq)]
enum Prev {
Newline,
Whitespace,
Escape,
Any,
}
impl Default for Context {
fn default() -> Self {
Self { top_block: false, prev: Prev::Whitespace }
}
}
#[derive(Clone, Copy, Debug, PartialEq)]
enum ParseOpt {
TrimNoEsc,
None,
}
pub fn entrypoint(txt: &str) -> MdStream<'_> {
let ctx = Context { top_block: true, prev: Prev::Newline };
normalize(parse_recursive(txt.trim().as_bytes(), ctx), &mut Vec::new())
}
fn parse_recursive<'a>(buf: &'a [u8], ctx: Context) -> MdStream<'_> {
use ParseOpt as Po;
use Prev::{Escape, Newline, Whitespace};
let mut stream: Vec<MdTree<'a>> = Vec::new();
let Context { top_block: top_blk, mut prev } = ctx;
let mut wip_buf = buf;
let mut loop_buf = wip_buf;
while !loop_buf.is_empty() {
let next_prev = match loop_buf[0] {
b'\n' => Newline,
b'\\' => Escape,
x if x.is_ascii_whitespace() => Whitespace,
_ => Prev::Any,
};
let res: ParseResult<'_> = match (top_blk, prev) {
(_, Newline | Whitespace) if loop_buf.starts_with(CMT_S) => {
parse_simple_pat(loop_buf, CMT_S, CMT_E, Po::TrimNoEsc, MdTree::Comment)
}
(true, Newline) if loop_buf.starts_with(CBK) => Some(parse_codeblock(loop_buf)),
(_, Newline | Whitespace) if loop_buf.starts_with(CIL) => parse_codeinline(loop_buf),
(true, Newline | Whitespace) if loop_buf.starts_with(HDG) => parse_heading(loop_buf),
(true, Newline) if loop_buf.starts_with(BRK) => {
Some((MdTree::HorizontalRule, parse_to_newline(loop_buf).1))
}
(_, Newline | Whitespace) if loop_buf.starts_with(EMP) => {
parse_simple_pat(loop_buf, EMP, EMP, Po::None, MdTree::Emphasis)
}
(_, Newline | Whitespace) if loop_buf.starts_with(STG) => {
parse_simple_pat(loop_buf, STG, STG, Po::None, MdTree::Strong)
}
(_, Newline | Whitespace) if loop_buf.starts_with(STK) => {
parse_simple_pat(loop_buf, STK, STK, Po::None, MdTree::Strikethrough)
}
(_, Newline | Whitespace) if loop_buf.starts_with(ANC_S) => {
let tt_fn = |link| MdTree::Link { disp: link, link };
let ret = parse_simple_pat(loop_buf, ANC_S, ANC_E, Po::None, tt_fn);
match ret {
Some((MdTree::Link { disp, .. }, _))
if disp.chars().all(|ch| LNK_CHARS.contains(ch)) =>
{
ret
}
_ => None,
}
}
(_, Newline) if (loop_buf.starts_with(UL1) || loop_buf.starts_with(UL2)) => {
Some(parse_unordered_li(loop_buf))
}
(_, Newline) if ord_list_start(loop_buf).is_some() => Some(parse_ordered_li(loop_buf)),
(_, Newline | Whitespace) if loop_buf.starts_with(LNK_S) => {
parse_any_link(loop_buf, top_blk && prev == Prev::Newline)
}
(_, Escape | _) => None,
};
if let Some((tree, rest)) = res {
let prev_buf = &wip_buf[..(wip_buf.len() - loop_buf.len())];
if !prev_buf.is_empty() {
let prev_str = str::from_utf8(prev_buf).unwrap();
stream.push(MdTree::PlainText(prev_str));
}
stream.push(tree);
wip_buf = rest;
loop_buf = rest;
} else {
loop_buf = &loop_buf[1..];
if loop_buf.is_empty() && !wip_buf.is_empty() {
let final_str = str::from_utf8(wip_buf).unwrap();
stream.push(MdTree::PlainText(final_str));
}
};
prev = next_prev;
}
MdStream(stream)
}
fn parse_simple_pat<'a, F>(
buf: &'a [u8],
start_pat: &[u8],
end_pat: &[u8],
opts: ParseOpt,
create_tt: F,
) -> ParseResult<'a>
where
F: FnOnce(&'a str) -> MdTree<'a>,
{
let ignore_esc = matches!(opts, ParseOpt::TrimNoEsc);
let trim = matches!(opts, ParseOpt::TrimNoEsc);
let (txt, rest) = parse_with_end_pat(&buf[start_pat.len()..], end_pat, ignore_esc)?;
let mut txt = str::from_utf8(txt).unwrap();
if trim {
txt = txt.trim();
}
Some((create_tt(txt), rest))
}
fn parse_codeinline(buf: &[u8]) -> ParseResult<'_> {
let seps = buf.iter().take_while(|ch| **ch == b'`').count();
let (txt, rest) = parse_with_end_pat(&buf[seps..], &buf[..seps], true)?;
Some((MdTree::CodeInline(str::from_utf8(txt).unwrap()), rest))
}
fn parse_codeblock(buf: &[u8]) -> Parsed<'_> {
let seps = buf.iter().take_while(|ch| **ch == b'`').count();
let end_sep = &buf[..seps];
let mut working = &buf[seps..];
let next_ws_idx = working.iter().take_while(|ch| !ch.is_ascii_whitespace()).count();
let lang = if next_ws_idx > 0 {
let tmp = str::from_utf8(&working[..next_ws_idx]).unwrap();
working = &working[next_ws_idx..];
Some(tmp)
} else {
None
};
let mut end_pat = vec![b'\n'];
end_pat.extend(end_sep);
let mut found = None;
for idx in (0..working.len()).filter(|idx| working[*idx..].starts_with(&end_pat)) {
let (eol_txt, rest) = parse_to_newline(&working[(idx + end_pat.len())..]);
if !eol_txt.iter().any(u8::is_ascii_whitespace) {
found = Some((&working[..idx], rest));
break;
}
}
let (txt, rest) = found.unwrap_or((working, &[]));
let txt = str::from_utf8(txt).unwrap().trim_matches('\n');
(MdTree::CodeBlock { txt, lang }, rest)
}
fn parse_heading(buf: &[u8]) -> ParseResult<'_> {
let level = buf.iter().take_while(|ch| **ch == b'#').count();
let buf = &buf[level..];
if level > 6 || (buf.len() > 1 && !buf[0].is_ascii_whitespace()) {
return None;
}
let (txt, rest) = parse_to_newline(&buf[1..]);
let ctx = Context { top_block: false, prev: Prev::Whitespace };
let stream = parse_recursive(txt, ctx);
Some((MdTree::Heading(level.try_into().unwrap(), stream), rest))
}
fn parse_unordered_li(buf: &[u8]) -> Parsed<'_> {
debug_assert!(buf.starts_with(b"* ") || buf.starts_with(b"- "));
let (txt, rest) = get_indented_section(&buf[2..]);
let ctx = Context { top_block: false, prev: Prev::Whitespace };
let stream = parse_recursive(trim_ascii_start(txt), ctx);
(MdTree::UnorderedListItem(stream), rest)
}
fn parse_ordered_li(buf: &[u8]) -> Parsed<'_> {
let (num, pos) = ord_list_start(buf).unwrap(); let (txt, rest) = get_indented_section(&buf[pos..]);
let ctx = Context { top_block: false, prev: Prev::Whitespace };
let stream = parse_recursive(trim_ascii_start(txt), ctx);
(MdTree::OrderedListItem(num, stream), rest)
}
fn get_indented_section(buf: &[u8]) -> (&[u8], &[u8]) {
let mut end = buf.len();
for (idx, window) in buf.windows(2).enumerate() {
let &[ch, next_ch] = window else { unreachable!("always 2 elements") };
if idx >= buf.len().saturating_sub(2) && next_ch == b'\n' {
end = buf.len().saturating_sub(1);
break;
} else if ch == b'\n' && (!next_ch.is_ascii_whitespace() || next_ch == b'\n') {
end = idx;
break;
}
}
(&buf[..end], &buf[end..])
}
fn ord_list_start(buf: &[u8]) -> Option<(u16, usize)> {
let pos = buf.iter().take(10).position(|ch| *ch == b'.')?;
let n = str::from_utf8(&buf[..pos]).ok()?;
if !buf.get(pos + 1)?.is_ascii_whitespace() {
return None;
}
n.parse::<u16>().ok().map(|v| (v, pos + 2))
}
fn parse_any_link(buf: &[u8], can_be_def: bool) -> ParseResult<'_> {
let (bracketed, rest) = parse_with_end_pat(&buf[1..], LNK_E, true)?;
if rest.is_empty() {
return None;
}
let disp = str::from_utf8(bracketed).unwrap();
match (can_be_def, rest[0]) {
(true, b':') => {
let (link, tmp) = parse_to_newline(&rest[1..]);
let link = str::from_utf8(link).unwrap().trim();
Some((MdTree::LinkDef { id: disp, link }, tmp))
}
(_, b'(') => parse_simple_pat(rest, b"(", b")", ParseOpt::TrimNoEsc, |link| MdTree::Link {
disp,
link,
}),
(_, b'[') => parse_simple_pat(rest, b"[", b"]", ParseOpt::TrimNoEsc, |id| {
MdTree::RefLink { disp, id: Some(id) }
}),
_ => Some((MdTree::RefLink { disp, id: None }, rest)),
}
}
fn parse_with_end_pat<'a>(
buf: &'a [u8],
end_sep: &[u8],
ignore_esc: bool,
) -> Option<(&'a [u8], &'a [u8])> {
for idx in (0..buf.len()).filter(|idx| buf[*idx..].starts_with(end_sep)) {
if !ignore_esc && idx > 0 && buf[idx - 1] == b'\\' {
continue;
}
return Some((&buf[..idx], &buf[idx + end_sep.len()..]));
}
None
}
fn parse_to_newline(buf: &[u8]) -> (&[u8], &[u8]) {
buf.iter().position(|ch| *ch == b'\n').map_or((buf, &[]), |pos| buf.split_at(pos))
}
fn normalize<'a>(MdStream(stream): MdStream<'a>, linkdefs: &mut Vec<MdTree<'a>>) -> MdStream<'a> {
let mut new_stream = Vec::with_capacity(stream.len());
let new_defs = stream.iter().filter(|tt| matches!(tt, MdTree::LinkDef { .. }));
linkdefs.extend(new_defs.cloned());
for item in stream {
match item {
MdTree::PlainText(txt) => expand_plaintext(txt, &mut new_stream, MdTree::PlainText),
MdTree::Strong(txt) => expand_plaintext(txt, &mut new_stream, MdTree::Strong),
MdTree::Emphasis(txt) => expand_plaintext(txt, &mut new_stream, MdTree::Emphasis),
MdTree::Strikethrough(txt) => {
expand_plaintext(txt, &mut new_stream, MdTree::Strikethrough);
}
MdTree::RefLink { disp, id } => new_stream.push(match_reflink(linkdefs, disp, id)),
MdTree::OrderedListItem(n, st) => {
new_stream.push(MdTree::OrderedListItem(n, normalize(st, linkdefs)));
}
MdTree::UnorderedListItem(st) => {
new_stream.push(MdTree::UnorderedListItem(normalize(st, linkdefs)));
}
MdTree::Heading(n, st) => new_stream.push(MdTree::Heading(n, normalize(st, linkdefs))),
_ => new_stream.push(item),
}
}
new_stream.retain(|x| !matches!(x, MdTree::Comment(_) | MdTree::LinkDef { .. }));
new_stream.dedup_by(|r, l| matches!((r, l), (MdTree::ParagraphBreak, MdTree::ParagraphBreak)));
if new_stream.first().is_some_and(is_break_ty) {
new_stream.remove(0);
}
if new_stream.last().is_some_and(is_break_ty) {
new_stream.pop();
}
let to_keep: Vec<bool> = new_stream
.windows(3)
.map(|w| {
!((matches!(&w[1], MdTree::ParagraphBreak)
&& matches!(should_break(&w[0], &w[2]), BreakRule::Always(1) | BreakRule::Never))
|| (matches!(&w[1], MdTree::PlainText(txt) if txt.trim().is_empty())
&& matches!(
should_break(&w[0], &w[2]),
BreakRule::Always(_) | BreakRule::Never
)))
})
.collect();
let mut iter = iter::once(true).chain(to_keep).chain(iter::once(true));
new_stream.retain(|_| iter.next().unwrap());
let mut insertions = 0;
let to_insert: Vec<(usize, MdTree<'_>)> = new_stream
.windows(2)
.enumerate()
.filter_map(|(idx, w)| match should_break(&w[0], &w[1]) {
BreakRule::Always(1) => Some((idx, MdTree::LineBreak)),
BreakRule::Always(2) => Some((idx, MdTree::ParagraphBreak)),
_ => None,
})
.map(|(idx, tt)| {
insertions += 1;
(idx + insertions, tt)
})
.collect();
to_insert.into_iter().for_each(|(idx, tt)| new_stream.insert(idx, tt));
MdStream(new_stream)
}
#[derive(Clone, Copy, Debug, PartialEq)]
enum BreakRule {
Always(u8),
Never,
Optional,
}
fn should_break(left: &MdTree<'_>, right: &MdTree<'_>) -> BreakRule {
use MdTree::*;
match (left, right) {
(HorizontalRule, _)
| (_, HorizontalRule)
| (OrderedListItem(_, _), OrderedListItem(_, _))
| (UnorderedListItem(_), UnorderedListItem(_)) => BreakRule::Always(1),
(Comment(_) | ParagraphBreak | Heading(_, _), _) | (_, Comment(_) | ParagraphBreak) => {
BreakRule::Never
}
(CodeBlock { .. } | OrderedListItem(_, _) | UnorderedListItem(_), _)
| (_, CodeBlock { .. } | Heading(_, _) | OrderedListItem(_, _) | UnorderedListItem(_)) => {
BreakRule::Always(2)
}
(
CodeInline(_)
| Strong(_)
| Emphasis(_)
| Strikethrough(_)
| PlainText(_)
| Link { .. }
| RefLink { .. }
| LinkDef { .. },
CodeInline(_)
| Strong(_)
| Emphasis(_)
| Strikethrough(_)
| PlainText(_)
| Link { .. }
| RefLink { .. }
| LinkDef { .. },
) => BreakRule::Optional,
(LineBreak, _) | (_, LineBreak) => {
unreachable!("should have been removed during deduplication")
}
}
}
fn is_break_ty(val: &MdTree<'_>) -> bool {
matches!(val, MdTree::ParagraphBreak | MdTree::LineBreak)
|| matches!(val, MdTree::PlainText(txt) if txt.trim().is_empty())
}
fn expand_plaintext<'a>(
txt: &'a str,
stream: &mut Vec<MdTree<'a>>,
mut f: fn(&'a str) -> MdTree<'a>,
) {
if txt.is_empty() {
return;
} else if txt == "\n" {
if let Some(tt) = stream.last() {
let tmp = MdTree::PlainText(" ");
if should_break(tt, &tmp) == BreakRule::Optional {
stream.push(tmp);
}
}
return;
}
let mut queue1 = Vec::new();
let mut queue2 = Vec::new();
let stream_start_len = stream.len();
for paragraph in txt.split("\n\n") {
if paragraph.is_empty() {
stream.push(MdTree::ParagraphBreak);
continue;
}
let paragraph = trim_extra_ws(paragraph);
queue1.clear();
queue1.push(paragraph);
for (from, to) in REPLACEMENTS {
queue2.clear();
for item in &queue1 {
for s in item.split(from) {
queue2.extend(&[s, to]);
}
if queue2.len() > 1 {
let _ = queue2.pop(); }
}
mem::swap(&mut queue1, &mut queue2);
}
queue1.retain(|s| !s.is_empty());
for idx in 0..queue1.len() {
queue1[idx] = trim_extra_ws(queue1[idx]);
if idx < queue1.len() - 1
&& queue1[idx].ends_with(char::is_whitespace)
&& queue1[idx + 1].starts_with(char::is_whitespace)
{
queue1[idx] = queue1[idx].trim_end();
}
}
stream.extend(queue1.iter().copied().filter(|txt| !txt.is_empty()).map(&mut f));
stream.push(MdTree::ParagraphBreak);
}
if stream.len() - stream_start_len > 1 {
let _ = stream.pop(); }
}
fn match_reflink<'a>(linkdefs: &[MdTree<'a>], disp: &'a str, match_id: Option<&str>) -> MdTree<'a> {
let to_match = match_id.unwrap_or(disp); for def in linkdefs {
if let MdTree::LinkDef { id, link } = def {
if *id == to_match {
return MdTree::Link { disp, link };
}
}
}
MdTree::Link { disp, link: "" } }
fn trim_extra_ws(mut txt: &str) -> &str {
let start_ws =
txt.bytes().position(|ch| !ch.is_ascii_whitespace()).unwrap_or(txt.len()).saturating_sub(1);
txt = &txt[start_ws..];
let end_ws = txt
.bytes()
.rev()
.position(|ch| !ch.is_ascii_whitespace())
.unwrap_or(txt.len())
.saturating_sub(1);
&txt[..txt.len() - end_ws]
}
fn trim_ascii_start(buf: &[u8]) -> &[u8] {
let count = buf.iter().take_while(|ch| ch.is_ascii_whitespace()).count();
&buf[count..]
}
#[cfg(test)]
#[path = "tests/parse.rs"]
mod tests;