talc/talc-lang/src/lstring.rs

825 lines
17 KiB
Rust

use std::{borrow::{Borrow, BorrowMut, Cow}, ffi::OsStr, fmt::{self, Write}, io, iter::{Copied, FusedIterator}, ops::{Add, AddAssign, Deref, DerefMut, Index, IndexMut}, rc::Rc, slice, str::Utf8Error, string::FromUtf8Error};
use unicode_ident::{is_xid_continue, is_xid_start};
/// Turns a string literal into an `&LStr` by passing it to
/// `LStr::from_str`; no copy of the literal is made.
#[macro_export]
macro_rules! lstr {
($s:literal) => {
$crate::lstring::LStr::from_str($s)
};
}
/// Like `format!`, but produces an `LString`: the arguments are forwarded
/// verbatim to `format!` and the resulting `String` is converted with
/// `LString::from`.
#[macro_export]
macro_rules! lformat {
($($t:tt)*) => {
$crate::lstring::LString::from(format!($($t)*))
};
}
//
// utility
//
/// Returns `true` when `b` is a UTF-8 continuation byte (`0b10xx_xxxx`).
#[inline]
fn is_continue(b: u8) -> bool {
    b & 0xc0 == 0x80
}

/// Folds the payload bits of `bytes` into `ch`, six bits per byte.
///
/// `ch` holds the payload bits taken from the lead byte. Returns `None` when
/// any byte is not a continuation byte, or when the accumulated value is not
/// a valid `char` (for example a surrogate or a value above U+10FFFF).
///
/// This does not reject overlong encodings; see [`calc_continue_strict`].
#[inline]
fn calc_continue(mut ch: u32, bytes: &[u8]) -> Option<char> {
    for b in bytes {
        if !is_continue(*b) {
            return None;
        }
        ch = (ch << 6) | (b & 0x3f) as u32;
    }
    char::from_u32(ch)
}

/// Like [`calc_continue`], but also rejects overlong encodings.
///
/// A sequence is overlong when the decoded character could have been encoded
/// in fewer bytes than were used (e.g. `C0 AF` for `/`). Well-formed UTF-8
/// forbids these, so the canonical length of the decoded `char` must equal
/// the length of the sequence (lead byte plus `tail`).
#[inline]
fn calc_continue_strict(lead_bits: u32, tail: &[u8]) -> Option<char> {
    calc_continue(lead_bits, tail).filter(|ch| ch.len_utf8() == tail.len() + 1)
}

/// Decodes one item from the front of `bytes`.
///
/// Returns `None` for empty input. Otherwise returns the remaining bytes and
/// either `Ok(char)` for a well-formed UTF-8 sequence or `Err(byte)` for a
/// single byte that does not start one (stray continuation byte, invalid lead
/// byte, truncated sequence, overlong encoding, surrogate, out-of-range
/// value). On `Err` exactly one byte is consumed.
#[inline]
fn next_codepoint(bytes: &[u8]) -> Option<(&[u8], Result<char, u8>)> {
    let init = *bytes.first()?;
    // Sequence length and lead-byte payload bits, selected by the lead byte.
    let (len, lead_bits) = match init {
        0..=0x7f => return Some((&bytes[1..], Ok(init as char))),
        0xc0..=0xdf => (2, init as u32 & 0x1f),
        0xe0..=0xef => (3, init as u32 & 0x0f),
        0xf0..=0xf7 => (4, init as u32 & 0x07),
        _ => return Some((&bytes[1..], Err(init))),
    };
    // `get` yields `None` when the sequence is truncated.
    let decoded = bytes
        .get(1..len)
        .and_then(|tail| calc_continue_strict(lead_bits, tail));
    match decoded {
        Some(ch) => Some((&bytes[len..], Ok(ch))),
        None => Some((&bytes[1..], Err(init))),
    }
}

/// Decodes one item from the back of `bytes`.
///
/// Mirror image of [`next_codepoint`]: returns `None` for empty input,
/// otherwise the bytes preceding the decoded item together with `Ok(char)`
/// for a well-formed UTF-8 sequence ending at the last byte, or `Err(byte)`
/// carrying just the last byte when no such sequence exists.
#[inline]
fn next_codepoint_back(bytes: &[u8]) -> Option<(&[u8], Result<char, u8>)> {
    let len = bytes.len();
    if len < 1 {
        return None;
    }
    let last = bytes[len - 1];
    if (0..=0x7f).contains(&last) {
        return Some((&bytes[..len - 1], Ok(last as char)));
    }
    'case: {
        if !is_continue(last) {
            break 'case;
        }
        // Walk backwards over continuation bytes until the lead byte of a
        // 2-, 3- or 4-byte sequence is found.
        for width in 2..=4usize {
            if len < width {
                break 'case;
            }
            let start = len - width;
            let lead = bytes[start];
            if is_continue(lead) {
                continue;
            }
            // The lead byte must announce exactly `width` bytes.
            let lead_bits = match (width, lead) {
                (2, 0xc0..=0xdf) => lead as u32 & 0x1f,
                (3, 0xe0..=0xef) => lead as u32 & 0x0f,
                (4, 0xf0..=0xf7) => lead as u32 & 0x07,
                _ => break 'case,
            };
            if let Some(ch) = calc_continue_strict(lead_bits, &bytes[start + 1..]) {
                return Some((&bytes[..start], Ok(ch)));
            }
            break 'case;
        }
    }
    Some((&bytes[..len - 1], Err(last)))
}
/// Owned, growable byte string.
///
/// The contents are stored as a plain `Vec<u8>`; nothing in this definition
/// requires them to be valid UTF-8.
#[derive(Clone, Default, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[repr(transparent)]
pub struct LString {
// Raw bytes of the string.
inner: Vec<u8>,
}
/// Borrowed byte-string slice; the unsized counterpart of `LString`.
///
/// `#[repr(transparent)]` over `[u8]`, which is what lets `&[u8]` be
/// reinterpreted as `&LStr` by pointer cast.
#[derive(PartialEq, Eq, PartialOrd, Ord, Hash)]
#[repr(transparent)]
pub struct LStr {
// Raw bytes of the string.
inner: [u8],
}
/// Quoted, escaped rendering: well-known control characters use short
/// escapes, other control characters use `\u{..}`, and bytes that do not
/// decode are written as `\xNN`.
impl fmt::Debug for LStr {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_char('"')?;
        let mut rest = &self.inner;
        while let Some((tail, decoded)) = next_codepoint(rest) {
            rest = tail;
            let ch = match decoded {
                Ok(ch) => ch,
                Err(byte) => {
                    write!(f, "\\x{byte:02x}")?;
                    continue;
                }
            };
            // Characters with a dedicated short escape.
            let short_escape = match ch {
                '"' => Some("\\\""),
                '\\' => Some("\\\\"),
                '\x00' => Some("\\0"),
                '\x07' => Some("\\a"),
                '\x08' => Some("\\b"),
                '\x09' => Some("\\t"),
                '\x0a' => Some("\\n"),
                '\x0b' => Some("\\v"),
                '\x0c' => Some("\\f"),
                '\x0d' => Some("\\r"),
                '\x1b' => Some("\\e"),
                _ => None,
            };
            match short_escape {
                Some(escape) => f.write_str(escape)?,
                None if ch.is_control() => write!(f, "\\u{{{:x}}}", ch as u32)?,
                None => f.write_char(ch)?,
            }
        }
        f.write_char('"')
    }
}
/// Writes every decodable character; bytes that do not decode are skipped.
impl fmt::Display for LStr {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let mut rest = &self.inner;
        loop {
            match next_codepoint(rest) {
                None => return Ok(()),
                Some((tail, Ok(ch))) => {
                    rest = tail;
                    f.write_char(ch)?;
                }
                // Undecodable byte: drop it and keep going.
                Some((tail, Err(_))) => rest = tail,
            }
        }
    }
}
impl fmt::Debug for LString {
#[inline]
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
fmt::Debug::fmt(self.as_ref(), f)
}
}
impl fmt::Display for LString {
#[inline]
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
fmt::Display::fmt(self.as_ref(), f)
}
}
//
// deref, asref, borrow
//
impl Deref for LString {
type Target = LStr;
#[inline]
fn deref(&self) -> &Self::Target {
<&LStr as From<&[u8]>>::from(self.inner.as_ref())
}
}
impl DerefMut for LString {
#[inline]
fn deref_mut(&mut self) -> &mut Self::Target {
<&mut LStr as From<&mut [u8]>>::from(self.inner.as_mut())
}
}
impl AsRef<LStr> for LString {
#[inline]
fn as_ref(&self) -> &LStr {
<&LStr as From<&[u8]>>::from(self.inner.as_ref())
}
}
impl AsMut<LStr> for LString {
#[inline]
fn as_mut(&mut self) -> &mut LStr {
<&mut LStr as From<&mut [u8]>>::from(self.inner.as_mut())
}
}
impl Borrow<LStr> for LString {
#[inline]
fn borrow(&self) -> &LStr {
<&LStr as From<&[u8]>>::from(self.inner.as_ref())
}
}
impl BorrowMut<LStr> for LString {
#[inline]
fn borrow_mut(&mut self) -> &mut LStr {
<&mut LStr as From<&mut [u8]>>::from(self.inner.as_mut())
}
}
//
// conversions
//
impl From<LString> for Vec<u8> {
#[inline]
fn from(value: LString) -> Self { value.inner }
}
impl From<Vec<u8>> for LString {
#[inline]
fn from(value: Vec<u8>) -> Self { Self { inner: value } }
}
/// Takes over the buffer of a `String` without copying.
impl From<String> for LString {
    #[inline]
    fn from(text: String) -> Self {
        let inner = text.into_bytes();
        Self { inner }
    }
}
/// Copies a borrowed `LStr` into a new owned string.
impl From<&LStr> for LString {
    #[inline]
    fn from(borrowed: &LStr) -> Self {
        Self { inner: borrowed.as_bytes().to_vec() }
    }
}
/// Copies the UTF-8 bytes of a `&str` into a new owned string.
impl From<&str> for LString {
    #[inline]
    fn from(text: &str) -> Self {
        Self::from(String::from(text))
    }
}
/// Copies a byte slice into a new owned string.
impl From<&[u8]> for LString {
    #[inline]
    fn from(bytes: &[u8]) -> Self {
        Self { inner: bytes.to_vec() }
    }
}
/// Reuses an owned `Cow` payload, or copies a borrowed one.
impl From<Cow<'_, LStr>> for LString {
    #[inline]
    fn from(cow: Cow<'_, LStr>) -> Self {
        cow.into_owned()
    }
}
/// Reuses an owned `Cow<str>` payload, or copies a borrowed one.
impl From<Cow<'_, str>> for LString {
    #[inline]
    fn from(cow: Cow<'_, str>) -> Self {
        let owned: String = match cow {
            Cow::Borrowed(text) => text.to_owned(),
            Cow::Owned(text) => text,
        };
        Self::from(owned)
    }
}
/// Reuses an owned `Cow<[u8]>` payload, or copies a borrowed one.
impl From<Cow<'_, [u8]>> for LString {
    #[inline]
    fn from(cow: Cow<'_, [u8]>) -> Self {
        let owned: Vec<u8> = match cow {
            Cow::Borrowed(bytes) => bytes.to_owned(),
            Cow::Owned(bytes) => bytes,
        };
        Self::from(owned)
    }
}
/// Exposes the raw bytes of an `LStr`.
impl<'a> From<&'a LStr> for &'a [u8] {
    #[inline]
    fn from(lstr: &'a LStr) -> Self {
        let bytes: &'a [u8] = &lstr.inner;
        bytes
    }
}
/// Views a byte slice as an `LStr` via `LStr::from_bytes`.
impl<'a> From<&'a [u8]> for &'a LStr {
    #[inline]
    fn from(bytes: &'a [u8]) -> Self {
        let view: &'a LStr = LStr::from_bytes(bytes);
        view
    }
}
/// Views a `&str` as an `LStr` via `LStr::from_str`.
impl<'a> From<&'a str> for &'a LStr {
    #[inline]
    fn from(text: &'a str) -> Self {
        let view: &'a LStr = LStr::from_str(text);
        view
    }
}
/// Exposes the raw bytes of an `LStr` for mutation.
impl<'a> From<&'a mut LStr> for &'a mut [u8] {
    #[inline]
    fn from(lstr: &'a mut LStr) -> Self {
        let bytes: &'a mut [u8] = &mut lstr.inner;
        bytes
    }
}
/// Views a mutable byte slice as an `LStr` via `LStr::from_bytes_mut`.
impl<'a> From<&'a mut [u8]> for &'a mut LStr {
    #[inline]
    fn from(bytes: &'a mut [u8]) -> Self {
        let view: &'a mut LStr = LStr::from_bytes_mut(bytes);
        view
    }
}
impl<'a> From<&'a LString> for &'a LStr {
#[inline]
fn from(value: &'a LString) -> Self { value }
}
/// Copies the bytes into a new reference-counted allocation.
impl From<&LStr> for Rc<LStr> {
    #[inline]
    fn from(v: &LStr) -> Rc<LStr> {
        let shared: Rc<[u8]> = Rc::from(v.as_bytes());
        let raw = Rc::into_raw(shared) as *const LStr;
        // SAFETY: `raw` came from `Rc::into_raw` on an `Rc<[u8]>`, and `LStr`
        // is `#[repr(transparent)]` over `[u8]`, so the pointee has the same
        // size and alignment under either type.
        unsafe { Rc::from_raw(raw) }
    }
}
/// Moves the contents of an `LString` into a reference-counted `LStr` by
/// going through the `&LStr` conversion.
impl From<LString> for Rc<LStr> {
    #[inline]
    fn from(v: LString) -> Rc<LStr> {
        let whole: &LStr = &v[..];
        Rc::from(whole)
    }
}
impl ToOwned for LStr {
type Owned = LString;
#[inline]
fn to_owned(&self) -> Self::Owned {
self.as_bytes().to_owned().into()
}
}
impl TryFrom<LString> for String {
type Error = FromUtf8Error;
#[inline]
fn try_from(value: LString) -> Result<Self, Self::Error> {
String::from_utf8(value.into())
}
}
impl<'a> TryFrom<&'a LStr> for &'a str {
type Error = Utf8Error;
#[inline]
fn try_from(value: &'a LStr) -> Result<Self, Self::Error> {
std::str::from_utf8(&value.inner)
}
}
/// Creates a string holding the UTF-8 encoding of one character.
impl From<char> for LString {
    #[inline]
    fn from(ch: char) -> Self {
        let text = String::from(ch);
        Self::from(text)
    }
}
/// Creates a one-byte string.
impl From<u8> for LString {
    #[inline]
    fn from(byte: u8) -> Self {
        Self::from(vec![byte])
    }
}
/// Collects characters by building a `String` first and converting it.
impl FromIterator<char> for LString {
    #[inline]
    fn from_iter<I: IntoIterator<Item=char>>(iter: I) -> Self {
        let text: String = iter.into_iter().collect();
        Self::from(text)
    }
}
/// Collects raw bytes into a new string.
impl FromIterator<u8> for LString {
    #[inline]
    fn from_iter<T: IntoIterator<Item=u8>>(iter: T) -> Self {
        let bytes: Vec<u8> = iter.into_iter().collect();
        Self::from(bytes)
    }
}
/// Appends raw bytes by forwarding to the inner `Vec<u8>`.
impl Extend<u8> for LString {
    #[inline]
    fn extend<I: IntoIterator<Item=u8>>(&mut self, iter: I) {
        Extend::<u8>::extend(&mut self.inner, iter);
    }
}
/// Appends borrowed bytes by forwarding to the inner `Vec<u8>`.
impl<'a> Extend<&'a u8> for LString {
    #[inline]
    fn extend<I: IntoIterator<Item=&'a u8>>(&mut self, iter: I) {
        Extend::<&'a u8>::extend(&mut self.inner, iter);
    }
}
/// Appends the UTF-8 encoding of each character.
impl Extend<char> for LString {
    #[inline]
    fn extend<I: IntoIterator<Item=char>>(&mut self, iter: I) {
        let chars = iter.into_iter();
        // Every char takes at least one byte, so the lower bound is a safe
        // amount to reserve up front.
        let (at_least, _) = chars.size_hint();
        self.reserve(at_least);
        for ch in chars {
            self.push_char(ch);
        }
    }
}
/// Appends the UTF-8 encoding of each borrowed character.
impl<'a> Extend<&'a char> for LString {
    #[inline]
    fn extend<I: IntoIterator<Item=&'a char>>(&mut self, iter: I) {
        let chars = iter.into_iter();
        // Every char takes at least one byte, so the lower bound is a safe
        // amount to reserve up front.
        let (at_least, _) = chars.size_hint();
        self.reserve(at_least);
        for ch in chars {
            self.push_char(*ch);
        }
    }
}
//
// write
//
/// In-memory writer: every write appends the whole buffer and succeeds.
impl io::Write for LString {
    fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
        let written = buf.len();
        self.inner.extend(buf);
        Ok(written)
    }

    /// Nothing is buffered, so flushing is a no-op.
    fn flush(&mut self) -> io::Result<()> {
        Ok(())
    }
}
//impl fmt::Write for LString {
// fn write_str(&mut self, s: &str) -> fmt::Result {
// self.extend(s.as_bytes());
// Ok(())
// }
//}
//
// methods
//
impl LString {
#[inline]
pub const fn new() -> Self {
Self { inner: Vec::new() }
}
#[inline]
pub fn with_capacity(capacity: usize) -> Self {
Vec::with_capacity(capacity).into()
}
#[inline]
pub fn reserve(&mut self, additional: usize) {
self.inner.reserve(additional);
}
#[inline]
pub fn push_byte(&mut self, byte: u8) {
self.inner.push(byte);
}
#[inline]
pub fn push_bytes(&mut self, bytes: &[u8]) {
self.inner.extend_from_slice(bytes);
}
#[inline]
pub fn push_char(&mut self, ch: char) {
let mut buf = [0; 5];
self.push_bytes(ch.encode_utf8(&mut buf).as_bytes());
}
#[inline]
pub fn push_lstr(&mut self, lstring: &LStr) {
self.push_bytes(lstring.as_bytes());
}
#[inline]
pub fn push_str(&mut self, string: &str) {
self.push_bytes(string.as_bytes());
}
#[inline]
pub fn clear(&mut self) {
self.inner.clear();
}
#[inline]
pub fn leak<'a>(self) -> &'a mut LStr {
self.inner.leak().into()
}
#[inline]
pub fn into_string(self) -> Result<String, FromUtf8Error> {
String::from_utf8(self.inner)
}
}
/// Iterator over raw bytes, yielded by value.
///
/// Thin wrapper around a copied slice iterator. Every method forwards to the
/// wrapped iterator so its specialised implementations are used.
#[derive(Clone, Debug)]
pub struct Bytes<'a>(Copied<slice::Iter<'a, u8>>);

impl<'a> Iterator for Bytes<'a> {
    type Item = u8;

    #[inline]
    fn next(&mut self) -> Option<Self::Item> {
        self.0.next()
    }

    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        self.0.size_hint()
    }

    #[inline]
    fn count(self) -> usize {
        self.0.count()
    }

    #[inline]
    fn last(self) -> Option<Self::Item> {
        self.0.last()
    }

    #[inline]
    fn nth(&mut self, n: usize) -> Option<Self::Item> {
        self.0.nth(n)
    }

    #[inline]
    fn all<F: FnMut(Self::Item) -> bool>(&mut self, f: F) -> bool {
        self.0.all(f)
    }

    #[inline]
    fn any<F: FnMut(Self::Item) -> bool>(&mut self, f: F) -> bool {
        self.0.any(f)
    }

    #[inline]
    fn find<P: FnMut(&Self::Item) -> bool>(&mut self, predicate: P) -> Option<Self::Item> {
        self.0.find(predicate)
    }

    #[inline]
    fn position<P: FnMut(Self::Item) -> bool>(&mut self, predicate: P) -> Option<usize> {
        self.0.position(predicate)
    }
}

// The wrapped slice iterator is double-ended, so expose that as well; this
// allows `.rev()` and matches the character iterators in this file.
impl<'a> DoubleEndedIterator for Bytes<'a> {
    #[inline]
    fn next_back(&mut self) -> Option<Self::Item> {
        self.0.next_back()
    }

    #[inline]
    fn nth_back(&mut self, n: usize) -> Option<Self::Item> {
        self.0.nth_back(n)
    }

    #[inline]
    fn rfind<P: FnMut(&Self::Item) -> bool>(&mut self, predicate: P) -> Option<Self::Item> {
        self.0.rfind(predicate)
    }
}

impl<'a> ExactSizeIterator for Bytes<'a> {
    #[inline]
    fn len(&self) -> usize {
        self.0.len()
    }
}

impl<'a> FusedIterator for Bytes<'a> {}
/// Iterator that decodes a byte slice item by item without losing data:
/// each item is either `Ok(char)` or `Err(byte)` for a byte that could not
/// be decoded.
#[derive(Clone)]
pub struct LosslessChars<'a>(&'a [u8]);

impl<'a> Iterator for LosslessChars<'a> {
    type Item = Result<char, u8>;

    #[inline]
    fn next(&mut self) -> Option<Self::Item> {
        next_codepoint(self.0).map(|(rest, item)| {
            self.0 = rest;
            item
        })
    }

    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        // Each item consumes between one and four bytes.
        let remaining = self.0.len();
        let fewest = (remaining + 3) / 4;
        (fewest, Some(remaining))
    }
}

impl<'a> DoubleEndedIterator for LosslessChars<'a> {
    fn next_back(&mut self) -> Option<Self::Item> {
        next_codepoint_back(self.0).map(|(rest, item)| {
            self.0 = rest;
            item
        })
    }
}
/// Iterator over the decodable characters of a byte string.
///
/// Wraps [`LosslessChars`] and silently skips every `Err(byte)` item, so
/// only successfully decoded characters are yielded.
#[derive(Clone)]
pub struct Chars<'a>(LosslessChars<'a>);

impl<'a> Iterator for Chars<'a> {
    type Item = char;

    #[inline]
    fn next(&mut self) -> Option<Self::Item> {
        loop {
            if let Ok(c) = self.0.next()? {
                return Some(c);
            }
        }
    }

    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        // Undecodable bytes are dropped, so the remaining input may yield no
        // characters at all: the lower bound has to be zero. The upper bound
        // of the inner iterator still holds.
        let (_, upper) = self.0.size_hint();
        (0, upper)
    }
}

impl<'a> DoubleEndedIterator for Chars<'a> {
    fn next_back(&mut self) -> Option<Self::Item> {
        loop {
            if let Ok(c) = self.0.next_back()? {
                return Some(c);
            }
        }
    }
}
impl LStr {
    /// Views a `&str` as an `LStr` without copying.
    #[inline]
    pub const fn from_str(string: &str) -> &Self {
        Self::from_bytes(string.as_bytes())
    }

    /// Views a byte slice as an `LStr` without copying or validation.
    #[inline]
    pub const fn from_bytes(bytes: &[u8]) -> &Self {
        // SAFETY: `LStr` is `#[repr(transparent)]` over `[u8]`, so the two
        // pointer types share layout and metadata; the result borrows
        // `bytes` for the same lifetime.
        unsafe { &*(bytes as *const [u8] as *const LStr) }
    }

    /// Views a mutable byte slice as an `LStr` without copying or validation.
    #[inline]
    pub fn from_bytes_mut(bytes: &mut [u8]) -> &mut Self {
        // SAFETY: same layout argument as `from_bytes`; exclusivity is
        // inherited from the incoming `&mut` borrow.
        unsafe { &mut *(bytes as *mut [u8] as *mut LStr) }
    }

    /// Returns the raw bytes.
    #[inline]
    pub const fn as_bytes(&self) -> &[u8] { &self.inner }

    /// Returns the raw bytes for mutation.
    #[inline]
    pub fn as_bytes_mut(&mut self) -> &mut [u8] { &mut self.inner }

    /// Iterates over the raw bytes by value.
    #[inline]
    pub fn bytes(&self) -> Bytes<'_> {
        Bytes(self.as_bytes().iter().copied())
    }

    /// Iterates over the decodable characters, skipping undecodable bytes.
    #[inline]
    pub fn chars(&self) -> Chars<'_> {
        Chars(self.chars_lossless())
    }

    /// Iterates over `Ok(char)` / `Err(byte)` items covering every byte.
    #[inline]
    pub fn chars_lossless(&self) -> LosslessChars<'_> {
        LosslessChars(self.as_bytes())
    }

    /// Borrows as a `&str`, failing if the bytes are not valid UTF-8.
    #[inline]
    pub const fn to_str(&self) -> Result<&str, Utf8Error> {
        std::str::from_utf8(&self.inner)
    }

    /// Length in bytes.
    #[inline]
    pub const fn len(&self) -> usize {
        self.inner.len()
    }

    /// Returns `true` when the string contains no bytes.
    #[inline]
    pub const fn is_empty(&self) -> bool {
        self.inner.is_empty()
    }

    /// Returns `true` when the bytes are valid UTF-8.
    #[inline]
    pub const fn is_utf8(&self) -> bool {
        self.to_str().is_ok()
    }

    /// Returns a valid-UTF-8 version of the string, produced by
    /// `String::from_utf8_lossy`. Borrows when no replacement was needed.
    #[inline]
    pub fn to_utf8_lossy(&self) -> Cow<'_, LStr> {
        match String::from_utf8_lossy(self.as_bytes()) {
            Cow::Borrowed(b) => Cow::Borrowed(b.into()),
            Cow::Owned(o) => Cow::Owned(o.into()),
        }
    }

    /// Converts to an `OsStr`.
    ///
    /// On Unix the bytes are borrowed unchanged. Elsewhere the string goes
    /// through its `Display` rendering, which drops undecodable bytes.
    #[inline]
    pub fn to_os_str(&self) -> Cow<'_, OsStr> {
        #[cfg(unix)] {
            use std::os::unix::ffi::OsStrExt;
            Cow::Borrowed(OsStr::from_bytes(self.as_bytes()))
        }
        #[cfg(not(unix))] {
            Cow::Owned(self.to_string().into())
        }
    }

    /// Applies `f` to the leading run of ASCII bytes and returns the result.
    ///
    /// Stops at the first non-ASCII byte, so the output length equals the
    /// length of the ASCII prefix.
    fn convert_while_ascii(&self, f: impl Fn(&u8) -> u8) -> LString {
        // The full length is a reasonable estimate for what the callers
        // will eventually push into the result.
        let mut out = LString::with_capacity(self.len());
        for b in self.bytes() {
            if !b.is_ascii() {
                break
            }
            out.push_byte(f(&b));
        }
        out
    }

    /// Returns an uppercased copy.
    ///
    /// Decodable characters are mapped with `char::to_uppercase`;
    /// undecodable bytes are copied through unchanged.
    pub fn to_uppercase(&self) -> LString {
        // Fast path for the ASCII prefix, then the general case.
        let mut out = self.convert_while_ascii(u8::to_ascii_uppercase);
        for ch in self[out.len()..].chars_lossless() {
            match ch {
                Ok(c) => out.extend(c.to_uppercase()),
                Err(b) => out.push_byte(b),
            }
        }
        out
    }

    /// Returns a lowercased copy.
    ///
    /// Decodable characters are mapped with `char::to_lowercase`;
    /// undecodable bytes are copied through unchanged.
    pub fn to_lowercase(&self) -> LString {
        // Fast path for the ASCII prefix, then the general case.
        let mut out = self.convert_while_ascii(u8::to_ascii_lowercase);
        for ch in self[out.len()..].chars_lossless() {
            match ch {
                Ok(c) => out.extend(c.to_lowercase()),
                Err(b) => out.push_byte(b),
            }
        }
        out
    }

    /// Strips leading and trailing whitespace (`char::is_whitespace`).
    pub fn trim(&self) -> &LStr {
        self.trim_by(char::is_whitespace)
    }

    /// Strips leading and trailing characters for which `pattern` is true.
    ///
    /// Undecodable bytes never match and therefore stop the trimming.
    pub fn trim_by(&self, pattern: impl Fn(char) -> bool) -> &LStr {
        // Offsets advance by the number of bytes the decoder actually
        // consumed rather than by `char::len_utf8`, so they stay correct
        // even if a character was decoded from a non-canonical sequence.
        let mut front = self.chars_lossless();
        let mut start = 0;
        loop {
            let before = front.0.len();
            match front.next() {
                Some(Ok(c)) if pattern(c) => start += before - front.0.len(),
                _ => break,
            }
        }
        if start == self.len() {
            return &self[0..0]
        }
        // Only scan the part that survived the front trim, which guarantees
        // `end >= start`.
        let mut back = self[start..].chars_lossless();
        let mut end = self.len();
        loop {
            let before = back.0.len();
            match back.next_back() {
                Some(Ok(c)) if pattern(c) => end -= before - back.0.len(),
                _ => break,
            }
        }
        &self[start..end]
    }

    /// Bytewise prefix test.
    pub fn starts_with(&self, s: &LStr) -> bool {
        self.as_bytes().starts_with(s.as_bytes())
    }

    /// Bytewise suffix test.
    pub fn ends_with(&self, s: &LStr) -> bool {
        self.as_bytes().ends_with(s.as_bytes())
    }

    /// Returns `true` when the string is a non-empty identifier: the first
    /// character satisfies `is_xid_start`, all others `is_xid_continue`, and
    /// there are no undecodable bytes.
    pub fn is_identifier(&self) -> bool {
        let mut chars = self.chars_lossless();
        let first = chars.next()
            .is_some_and(|ch| ch.is_ok_and(is_xid_start));
        if !first {
            return false
        }
        chars.all(|ch| ch.is_ok_and(is_xid_continue))
    }
}
/// `lstring + &lstr` appends in place and returns the left operand.
impl Add<&LStr> for LString {
    type Output = Self;

    #[inline]
    fn add(self, rhs: &LStr) -> Self::Output {
        let mut joined = self;
        joined.push_bytes(rhs.as_bytes());
        joined
    }
}
/// `lstring += &lstr` appends the bytes of the right operand.
impl AddAssign<&LStr> for LString {
    #[inline]
    fn add_assign(&mut self, rhs: &LStr) {
        self.push_bytes(rhs.as_bytes());
    }
}
impl Default for &LStr {
fn default() -> Self { [].as_ref().into() }
}
impl Default for &mut LStr {
fn default() -> Self { [].as_mut().into() }
}
// Generates `Index` and `IndexMut` for both `LStr` and `LString` for one
// index type `$ty`. Each impl indexes the inner byte storage with `$ty` and
// converts the resulting byte slice back into an `LStr`.
macro_rules! impl_index {
($ty:ty) => {
impl Index<$ty> for LStr {
type Output = LStr;
fn index(&self, index: $ty) -> &Self::Output {
self.inner.index(index).into()
}
}
impl IndexMut<$ty> for LStr {
fn index_mut(&mut self, index: $ty) -> &mut LStr {
self.inner.index_mut(index).into()
}
}
impl Index<$ty> for LString {
type Output = LStr;
fn index(&self, index: $ty) -> &Self::Output {
self.inner.index(index).into()
}
}
impl IndexMut<$ty> for LString {
fn index_mut(&mut self, index: $ty) -> &mut LStr {
self.inner.index_mut(index).into()
}
}
};
}
// Instantiate the indexing impls for each listed range type over `usize`,
// including the `(Bound, Bound)` pair form.
impl_index!(std::ops::Range<usize>);
impl_index!(std::ops::RangeFrom<usize>);
impl_index!(std::ops::RangeFull);
impl_index!(std::ops::RangeInclusive<usize>);
impl_index!(std::ops::RangeTo<usize>);
impl_index!(std::ops::RangeToInclusive<usize>);
impl_index!((std::ops::Bound<usize>, std::ops::Bound<usize>));